github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/defaultIndexingChain.go

     1  package index
     2  
     3  import (
     4  	"fmt"
     5  	"github.com/balzaczyy/golucene/core/analysis"
     6  	. "github.com/balzaczyy/golucene/core/codec/spi"
     7  	. "github.com/balzaczyy/golucene/core/index/model"
     8  	"github.com/balzaczyy/golucene/core/store"
     9  	"github.com/balzaczyy/golucene/core/util"
    10  )
    11  
    12  /* Default general purpose indexing chain, which handles indexing all types of fields */
    13  type DefaultIndexingChain struct {
    14  	bytesUsed  util.Counter
    15  	docState   *docState
    16  	docWriter  *DocumentsWriterPerThread
    17  	fieldInfos *FieldInfosBuilder
    18  
    19  	// Writes postings and term vectors:
    20  	termsHash TermsHash
    21  
    22  	storedFieldsWriter StoredFieldsWriter // lazy init
    23  	lastStoredDocId    int
    24  
    25  	fieldHash []*PerField
    26  	hashMask  int
    27  
    28  	totalFieldCount int
    29  	nextFieldGen    int64
    30  
    31  	// Holds fields seen in each document
    32  	fields []*PerField
    33  }
    34  
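        /*
        Wires the indexing chain to its DocumentsWriterPerThread: postings
        and term vectors are written by a FreqProxTermsWriter built on top
        of a TermVectorsConsumer, and the PerField hash starts at size 2
        (hashMask 1), growing later via rehash().
        */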
    35  func newDefaultIndexingChain(docWriter *DocumentsWriterPerThread) *DefaultIndexingChain {
    36  	termVectorsWriter := newTermVectorsConsumer(docWriter)
    37  	return &DefaultIndexingChain{
    38  		docWriter:  docWriter,
    39  		fieldInfos: docWriter.fieldInfos,
    40  		docState:   docWriter.docState,
    41  		bytesUsed:  docWriter._bytesUsed,
    42  		termsHash:  newFreqProxTermsWriter(docWriter, termVectorsWriter),
    43  		fieldHash:  make([]*PerField, 2),
    44  		hashMask:   1,
    45  		fields:     make([]*PerField, 1),
    46  	}
    47  }
    48  
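        /* Lazily creates the StoredFieldsWriter from the codec's StoredFieldsFormat the first time stored fields are needed. */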
    49  // TODO: can we remove this lazy-init / make cleaner / do it another way...?
    50  func (c *DefaultIndexingChain) initStoredFieldsWriter() (err error) {
    51  	if c.storedFieldsWriter == nil {
    52  		assert(c != nil)
    53  		assert(c.docWriter != nil)
    54  		assert(c.docWriter.codec != nil)
    55  		assert(c.docWriter.codec.StoredFieldsFormat() != nil)
    56  		c.storedFieldsWriter, err = c.docWriter.codec.StoredFieldsFormat().FieldsWriter(
    57  			c.docWriter.directory, c.docWriter.segmentInfo, store.IO_CONTEXT_DEFAULT)
    58  	}
    59  	return
    60  }
    61  
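        /*
        Flushes all buffered state for the segment: norms, doc values,
        stored fields (catching up any docs that never wrote them), the
        terms hash (postings and term vectors), and finally the FieldInfos.
        */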
    62  func (c *DefaultIndexingChain) flush(state *SegmentWriteState) (err error) {
    63  	// NOTE: caller (DWPT) handles aborting on any error from this method
    64  
    65  	numDocs := state.SegmentInfo.DocCount()
    66  	if err = c.writeNorms(state); err != nil {
    67  		return
    68  	}
    69  	if err = c.writeDocValues(state); err != nil {
    70  		return
    71  	}
    72  
    73  	// it's possible all docs hit non-aborting errors...
    74  	if err = c.initStoredFieldsWriter(); err != nil {
    75  		return
    76  	}
    77  	if err = c.fillStoredFields(numDocs); err != nil {
    78  		return
    79  	}
    80  	if err = c.storedFieldsWriter.Finish(state.FieldInfos, numDocs); err != nil {
    81  		return
    82  	}
    83  	if err = c.storedFieldsWriter.Close(); err != nil {
    84  		return
    85  	}
    86  
    87  	fieldsToFlush := make(map[string]TermsHashPerField)
    88  	for _, perField := range c.fieldHash {
    89  		for perField != nil {
    90  			if perField.invertState != nil {
    91  				fieldsToFlush[perField.fieldInfo.Name] = perField.termsHashPerField
    92  			}
    93  			perField = perField.next
    94  		}
    95  	}
    96  
    97  	if err = c.termsHash.flush(fieldsToFlush, state); err != nil {
    98  		return
    99  	}
   100  
   101  	// important to save after asking consumer to flush so consumer can
   102  	// alter the FieldInfo* if necessary. E.g., FreqProxTermsWriter does
   103  	// this with FieldInfo.storePayload.
   104  	infosWriter := c.docWriter.codec.FieldInfosFormat().FieldInfosWriter()
   105  	return infosWriter(state.Directory, state.SegmentInfo.Name, "", state.FieldInfos, store.IO_CONTEXT_DEFAULT)
   106  }
   107  
   108  /* Writes all buffered doc values (called from flush()) */
   109  func (c *DefaultIndexingChain) writeDocValues(state *SegmentWriteState) (err error) {
   110  	docCount := state.SegmentInfo.DocCount()
   111  	var dvConsumer DocValuesConsumer
   112  	var success = false
        	// close the consumer after flushing (suppressing the error on failure), mirroring writeNorms
        	defer func() {
   113  		if success {
   114  			err = util.Close(dvConsumer)
   115  		} else {
   116  			util.CloseWhileSuppressingError(dvConsumer)
   117  		}
        	}()
   118  
   119  	for _, perField := range c.fieldHash {
   120  		for perField != nil {
   121  			if perField.docValuesWriter != nil {
   122  				if dvConsumer == nil {
   123  					// lazy init
   124  					format := state.SegmentInfo.Codec().(Codec).DocValuesFormat()
   125  					if dvConsumer, err = format.FieldsConsumer(state); err != nil {
   126  						return
   127  					}
   128  				}
   129  
   130  				perField.docValuesWriter.finish(docCount)
   131  				if err = perField.docValuesWriter.flush(state, dvConsumer); err != nil {
   132  					return
   133  				}
   134  				perField.docValuesWriter = nil
   135  			}
   136  			perField = perField.next
   137  		}
   138  	}
   139  
   140  	success = true
   141  	return nil
   142  }
   143  
   144  /*
   145  Catch up for all docs before us that had no stored fields, or hit
   146  non-aborting errors before writing stored fields.
   147  */
   148  func (c *DefaultIndexingChain) fillStoredFields(docId int) (err error) {
   149  	for err == nil && c.lastStoredDocId < docId {
   150  		err = c.startStoredFields()
   151  		if err == nil {
   152  			err = c.finishStoredFields()
   153  		}
   154  	}
   155  	return
   156  }
   157  
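        /* Writes all buffered norms (called from flush()); the norms consumer is closed, or closed while suppressing errors on failure, via defer. */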
   158  func (c *DefaultIndexingChain) writeNorms(state *SegmentWriteState) (err error) {
   159  	var success = false
   160  	var normsConsumer DocValuesConsumer
   161  	defer func() {
   162  		if success {
   163  			err = util.Close(normsConsumer)
   164  		} else {
   165  			util.CloseWhileSuppressingError(normsConsumer)
   166  		}
   167  	}()
   168  
   169  	if state.FieldInfos.HasNorms {
   170  		normsFormat := state.SegmentInfo.Codec().(Codec).NormsFormat()
   171  		assert(normsFormat != nil)
   172  		if normsConsumer, err = normsFormat.NormsConsumer(state); err != nil {
   173  			return
   174  		}
   175  
   176  		for _, fi := range state.FieldInfos.Values {
   177  			perField := c.perField(fi.Name)
   178  			assert(perField != nil)
   179  
   180  			// we must check the final value of omitNorms for the FieldInfo:
   181  			// it could have changed for this field since the first time we
   182  			// added it.
   183  			if !fi.OmitsNorms() {
   184  				if perField.norms != nil {
   185  					perField.norms.finish(state.SegmentInfo.DocCount())
   186  					if err = perField.norms.flush(state, normsConsumer); err != nil {
   187  						return
   188  					}
   189  					assert(fi.NormType() == DOC_VALUES_TYPE_NUMERIC)
   190  				} else if fi.IsIndexed() {
   191  					assert2(fi.NormType() == 0, "got %v; field=%v", fi.NormType(), fi.Name)
   192  				}
   193  			}
   194  		}
   195  	}
   196  	success = true
   197  	return nil
   198  }
   199  
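        /* Aborts the segment being written: tears down the stored fields writer and the terms hash (errors ignored) and clears the field hash. */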
   200  func (c *DefaultIndexingChain) abort() {
   201  	// E.g. close any open files in the stored fields writer:
   202  	if c.storedFieldsWriter != nil {
   203  		c.storedFieldsWriter.Abort() // ignore error
   204  	}
   205  
   206  	// E.g. close any open files in the term vectors writer:
   207  	c.termsHash.abort()
   208  
   209  	for i := range c.fieldHash {
   210  		c.fieldHash[i] = nil
   211  	}
   212  }
   213  
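        /* Doubles the PerField hash table; the size stays a power of two so hashMask can remain size-1, and every chained entry is re-linked into its new bucket. */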
   214  func (c *DefaultIndexingChain) rehash() {
   215  	newHashSize := 2 * len(c.fieldHash)
   216  	assert(newHashSize > len(c.fieldHash))
   217  
   218  	newHashArray := make([]*PerField, newHashSize)
   219  
   220  	// rehash
   221  	newHashMask := newHashSize - 1
   222  	for _, fp0 := range c.fieldHash {
   223  		for fp0 != nil {
   224  			hashPos2 := util.Hashstr(fp0.fieldInfo.Name) & newHashMask
   225  			fp0.next, newHashArray[hashPos2], fp0 =
   226  				newHashArray[hashPos2], fp0, fp0.next
   227  		}
   228  	}
   229  
   230  	c.fieldHash = newHashArray
   231  	c.hashMask = newHashMask
   232  }
   233  
   234  /* Calls StoredFieldsWriter.startDocument, aborting the segment if it hits any error. */
   235  func (c *DefaultIndexingChain) startStoredFields() (err error) {
   236  	var success = false
   237  	defer func() {
   238  		if !success {
   239  			c.docWriter.setAborting()
   240  		}
   241  	}()
   242  
   243  	if err = c.initStoredFieldsWriter(); err != nil {
   244  		return
   245  	}
   246  	if err = c.storedFieldsWriter.StartDocument(); err != nil {
   247  		return
   248  	}
   249  	success = true
   250  
   251  	c.lastStoredDocId++
   252  	return nil
   253  }
   254  
   255  /* Calls StoredFieldsWriter.finishDocument(), aborting the segment if it hits any error. */
   256  func (c *DefaultIndexingChain) finishStoredFields() error {
   257  	var success = false
   258  	defer func() {
   259  		if !success {
   260  			c.docWriter.setAborting()
   261  		}
   262  	}()
   263  	if err := c.storedFieldsWriter.FinishDocument(); err != nil {
   264  		return err
   265  	}
   266  	success = true
   267  	return nil
   268  }
   269  
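        /*
        Indexes one document: starts the terms hash and stored fields,
        runs every IndexableField through processField, then finishes the
        per-field state, the stored fields and the terms hash. An error
        from the terms hash marks the writer as aborting, since on-disk
        term vectors may be corrupt.
        */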
   270  func (c *DefaultIndexingChain) processDocument() (err error) {
   271  	// How many indexed field names we've seen (collapses multiple
   272  	// field instances by the same name):
   273  	fieldCount := 0
   274  
   275  	fieldGen := c.nextFieldGen
   276  	c.nextFieldGen++
   277  
   278  	// NOTE: we need two passes here, in case there are multi-valued
   279  	// fields, because we must process all instances of a given field
   280  	// at once, since the analyzer is free to reuse TokenStream across
   281  	// fields (i.e., we cannot have more than one TokenStream running
   282  	// "at once"):
   283  
   284  	c.termsHash.startDocument()
   285  
   286  	if err = c.fillStoredFields(c.docState.docID); err != nil {
   287  		return
   288  	}
   289  	if err = c.startStoredFields(); err != nil {
   290  		return
   291  	}
   292  
        	// use a named return so errors merged by the deferred finish calls below are not lost
   293  	if err = func() (err error) {
   294  		defer func() {
   295  			if !c.docWriter.aborting {
   296  				// Finish each indexed field name seen in the document:
   297  				for _, field := range c.fields[:fieldCount] {
   298  					err = mergeError(err, field.finish())
   299  				}
   300  				err = mergeError(err, c.finishStoredFields())
   301  			}
   302  		}()
   303  
   304  		for _, field := range c.docState.doc {
   305  			if fieldCount, err = c.processField(field, fieldGen, fieldCount); err != nil {
   306  				return err
   307  			}
   308  		}
   309  		return nil
   310  	}(); err != nil {
   311  		return
   312  	}
   313  
   314  	var success = false
   315  	defer func() {
   316  		if !success {
   317  			// Must abort, on the possibility that on-disk term vectors are now corrupt:
   318  			c.docWriter.setAborting()
   319  		}
   320  	}()
   321  
   322  	if err = c.termsHash.finishDocument(); err != nil {
   323  		return
   324  	}
   325  	success = true
   326  	return nil
   327  }
   328  
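        /*
        Processes a single IndexableField in up to three steps: inverts it
        if the field type is indexed, writes it to stored fields if it is
        stored, and (not yet implemented) buffers doc values. Returns the
        updated count of distinct indexed field names seen in this document.
        */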
   329  func (c *DefaultIndexingChain) processField(field IndexableField,
   330  	fieldGen int64, fieldCount int) (int, error) {
   331  
   332  	var fieldName string = field.Name()
   333  	var fieldType IndexableFieldType = field.FieldType()
   334  	var fp *PerField
   335  
   336  	// Invert indexed fields:
   337  	if fieldType.Indexed() {
   338  
   339  		// if the field omits norms, the boost cannot be indexed.
   340  		if fieldType.OmitNorms() && field.Boost() != 1 {
   341  			panic(fmt.Sprintf(
   342  				"You cannot set an index-time boost: norms are omitted for field '%v'",
   343  				fieldName))
   344  		}
   345  
   346  		fp = c.getOrAddField(fieldName, fieldType, true)
   347  		first := fp.fieldGen != fieldGen
   348  		if err := fp.invert(field, first); err != nil {
   349  			return 0, err
   350  		}
   351  
   352  		if first {
   353  			c.fields[fieldCount] = fp
   354  			fieldCount++
   355  			fp.fieldGen = fieldGen
   356  		}
   357  	} else {
   358  		panic("not implemented yet")
   359  	}
   360  
   361  	// Add stored fields:
   362  	if fieldType.Stored() {
   363  		if fp == nil {
   364  			panic("not implemented yet")
   365  		}
   367  		if err := func() error {
   368  			var success = false
   369  			defer func() {
   370  				if !success {
   371  					c.docWriter.setAborting()
   372  				}
   373  			}()
   374  
   375  			if err := c.storedFieldsWriter.WriteField(fp.fieldInfo, field); err != nil {
   376  				return err
   377  			}
   378  			success = true
   379  			return nil
   380  		}(); err != nil {
   381  			return 0, err
   382  		}
   384  	}
   385  
   386  	if dvType := fieldType.DocValueType(); int(dvType) != 0 {
   387  		if fp == nil {
   388  			panic("not implemented yet")
   389  		}
   390  		panic("not implemented yet")
   391  	}
   392  
   393  	return fieldCount, nil
   394  }
   395  
   396  /*
   397  Returns a previously created PerField, or nil if this field name
   398  wasn't seen yet.
   399  */
   400  func (c *DefaultIndexingChain) perField(name string) *PerField {
   401  	hashPos := util.Hashstr(name) & c.hashMask
   402  	fp := c.fieldHash[hashPos]
   403  	for fp != nil && fp.fieldInfo.Name != name {
   404  		fp = fp.next
   405  	}
   406  	return fp
   407  }
   408  
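        /*
        Returns the PerField for name, creating it (and registering its
        FieldInfo) the first time the field is seen in this segment. The
        hash table is kept at no more than 50% load, and the fields slice
        is grown to hold one slot per distinct field. For an existing
        entry, the FieldInfo is updated from fieldType and invert state is
        initialized lazily if requested.
        */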
   409  func (c *DefaultIndexingChain) getOrAddField(name string,
   410  	fieldType IndexableFieldType, invert bool) *PerField {
   411  
   412  	// Make sure we have a PerField allocated
   413  	hashPos := util.Hashstr(name) & c.hashMask
   414  	fp := c.fieldHash[hashPos]
   415  	for fp != nil && fp.fieldInfo.Name != name {
   416  		fp = fp.next
   417  	}
   418  
   419  	if fp == nil {
   420  		// First time we are seeing this field in this segment
   421  
   422  		fi := c.fieldInfos.AddOrUpdate(name, fieldType)
   423  
   424  		fp = newPerField(c, fi, invert)
   425  		fp.next = c.fieldHash[hashPos]
   426  		c.fieldHash[hashPos] = fp
   427  		c.totalFieldCount++
   428  
   429  		// At most 50% load factor:
   430  		if c.totalFieldCount >= len(c.fieldHash)/2 {
   431  			c.rehash()
   432  		}
   433  
   434  		if c.totalFieldCount > len(c.fields) {
   435  			newFields := make([]*PerField, util.Oversize(c.totalFieldCount, util.NUM_BYTES_OBJECT_REF))
   436  			copy(newFields, c.fields)
   437  			c.fields = newFields
   438  		}
   439  
   440  	} else {
   441  		fp.fieldInfo.Update(fieldType)
   442  
   443  		if invert && fp.invertState == nil {
   444  			fp.setInvertState()
   445  		}
   446  	}
   447  
   448  	return fp
   449  }
   450  
   451  type PerField struct {
   452  	*DefaultIndexingChain // access at least docState, termsHash.
   453  
   454  	fieldInfo  *FieldInfo
   455  	similarity Similarity
   456  
   457  	invertState       *FieldInvertState
   458  	termsHashPerField TermsHashPerField
   459  
   460  	// non-nil if this field ever had doc values in this segment:
   461  	docValuesWriter DocValuesWriter
   462  
   463  	// We use this to know when a PerField is seen for the first time
   464  	// in the current document.
   465  	fieldGen int64
   466  
   467  	// Used by the hash table
   468  	next *PerField
   469  
   470  	// Lazy init'd:
   471  	norms *NumericDocValuesWriter
   472  
   473  	// reused
   474  	tokenStream analysis.TokenStream
   475  }
   476  
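        /* Creates a PerField bound to its parent chain; invert state (and the per-field terms hash) is only allocated when invert is requested. */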
   477  func newPerField(parent *DefaultIndexingChain,
   478  	fieldInfo *FieldInfo, invert bool) *PerField {
   479  
   480  	ans := &PerField{
   481  		DefaultIndexingChain: parent,
   482  		fieldInfo:            fieldInfo,
   483  		similarity:           parent.docState.similarity,
   484  		fieldGen:             -1,
   485  	}
   486  	if invert {
   487  		ans.setInvertState()
   488  	}
   489  	return ans
   490  }
   491  
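        /* Lazily allocates the FieldInvertState and registers this field with the terms hash. */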
   492  func (f *PerField) setInvertState() {
   493  	f.invertState = newFieldInvertState(f.fieldInfo.Name)
   494  	f.termsHashPerField = f.termsHash.addField(f.invertState, f.fieldInfo)
   495  }
   496  
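        /*
        Called from processDocument once per indexed field name, after all
        of that field's instances have been inverted: buffers the norm
        value (unless norms are omitted) and finishes the per-field terms
        hash.
        */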
   497  func (f *PerField) finish() error {
   498  	if !f.fieldInfo.OmitsNorms() {
   499  		if f.norms == nil {
   500  			f.fieldInfo.SetNormValueType(DOC_VALUES_TYPE_NUMERIC)
   501  			f.norms = newNumericDocValuesWriter(f.fieldInfo, f.docState.docWriter._bytesUsed, false)
   502  		}
   503  		f.norms.addValue(f.docState.docID, f.similarity.ComputeNorm(f.invertState))
   504  	}
   505  	return f.termsHashPerField.finish()
   506  }
   507  
   508  /*
   509  Inverts one field for one document; first is true if this is the
   510  first time we are seeing this field name in this document.
   511  */
   512  func (f *PerField) invert(field IndexableField, first bool) error {
   513  	if first {
   514  		// first time we're seeing this field (indexed) in this document:
   515  		f.invertState.reset()
   516  	}
   517  
   518  	fieldType := field.FieldType()
   519  
   520  	analyzed := fieldType.Tokenized() && f.docState.analyzer != nil
   521  
   522  	if err := func() (err error) {
   523  		// only bother checking offsets if something will consume them
   524  		// TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
   525  		checkOffsets := fieldType.IndexOptions() == INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
   526  
   527  		// To assist people in tracking down problems in analysis components,
   528  		// we wish to write the field name to the infostream when we fail.
   529  		// We expect some caller to eventually deal with the real error, so
   530  		// we don't want any error handling, but rather a deferred function
   531  		// that takes note of the problem.
   532  		aborting := false
   533  		succeededInProcessingField := false
   534  		defer func() {
   535  			if err != nil {
   536  				if _, ok := err.(util.MaxBytesLengthExceededError); ok {
   537  					aborting = false
   538  					prefix := make([]byte, 30)
   539  					bigTerm := f.invertState.termAttribute.BytesRef()
   540  					copy(prefix, bigTerm.ToBytes()) // copy stops at len(prefix): at most 30 bytes
   541  					if f.docState.infoStream.IsEnabled("IW") {
   542  						f.docState.infoStream.Message("IW",
   543  							"ERROR: Document contains at least one immense term in field='%v' "+
   544  								"(whose UTF8 encoding is longer than the max length %v), "+
   545  								"all of which were skipped. Please correct the analyzer to not produce such terms. "+
   546  								"The prefix of the first immense term is: '%v...', original message: %v",
   547  							f.fieldInfo.Name, MAX_TERM_LENGTH_UTF8, string(prefix), err)
   548  					}
   549  				}
   550  			}
   551  			if !succeededInProcessingField && aborting {
   552  				f.docState.docWriter.setAborting()
   553  			}
   554  
   555  			if !succeededInProcessingField && f.docState.infoStream.IsEnabled("DW") {
   556  				f.docState.infoStream.Message("DW",
   557  					"An error was returned while processing field %v",
   558  					f.fieldInfo.Name)
   559  			}
   560  		}()
   561  
   562  		var stream analysis.TokenStream
   563  		stream, err = field.TokenStream(f.docState.analyzer, f.tokenStream)
   564  		if err != nil {
   565  			return err
   566  		}
   567  		defer stream.Close()
   568  
   569  		f.tokenStream = stream
   570  		// reset the TokenStream to the first token
   571  		if err = stream.Reset(); err != nil {
   572  			return err
   573  		}
   574  
   575  		f.invertState.setAttributeSource(stream.Attributes())
   576  
   577  		f.termsHashPerField.start(field, first)
   578  
   579  		for {
   580  			var ok bool
   581  			if ok, err = stream.IncrementToken(); err != nil {
   582  				return err
   583  			}
   584  			if !ok {
   585  				break
   586  			}
   587  
   588  			// if we hit an error from the token stream here (which is fairly
   589  			// common, e.g. if the analyzer chokes on a given document), then
   590  			// it's non-aborting and (above) this one document will be
   591  			// marked as deleted, but still consume a docId
   592  
   593  			posIncr := f.invertState.posIncrAttribute.PositionIncrement()
   594  			if f.invertState.position += posIncr; f.invertState.position < f.invertState.lastPosition {
   595  				assert2(posIncr != 0,
   596  					"first position increment must be > 0 (got 0) for field '%v'",
   597  					field.Name())
   598  				panic(fmt.Sprintf(
   599  					"position increments (and gaps) must be >= 0 (got %v) for field '%v'",
   600  					posIncr, field.Name()))
   601  			}
   602  			f.invertState.lastPosition = f.invertState.position
   603  			if posIncr == 0 {
   604  				f.invertState.numOverlap++
   605  			}
   606  
   607  			if checkOffsets {
   608  				startOffset := f.invertState.offset + f.invertState.offsetAttribute.StartOffset()
   609  				endOffset := f.invertState.offset + f.invertState.offsetAttribute.EndOffset()
   610  				assert2(startOffset >= f.invertState.lastStartOffset && startOffset <= endOffset,
   611  					"startOffset must be non-negative, "+
   612  						"and endOffset must be >= startOffset, "+
   613  						"and offsets must not go backwards "+
   614  						"startOffset=%v,endOffset=%v,lastStartOffset=%v for field '%v'",
   615  					startOffset, endOffset, f.invertState.lastStartOffset, field.Name())
   616  				f.invertState.lastStartOffset = startOffset
   617  			}
   618  
   619  			// fmt.Printf("  term=%v\n", f.invertState.termAttribute)
   620  
   621  			// if we hit an error in here, we abort all buffered documents
   622  			// since the last flush, on the likelihood that the internal
   623  			// state of the terms hash is now corrupt and should not be
   624  			// flushed to a new segment:
   625  			aborting = true
   626  			if err = f.termsHashPerField.add(); err != nil {
   627  				return err
   628  			}
   629  			aborting = false
   630  
   631  			f.invertState.length++
   632  		}
   633  
   634  		// trigger streams to perform end-of-stream operations
   635  		if err = stream.End(); err != nil {
   636  			return err
   637  		}
   638  
   639  		// TODO: maybe add some safety? then again, it's already checked
   640  		// when we come back around to the field...
   641  		f.invertState.position += f.invertState.posIncrAttribute.PositionIncrement()
   642  		f.invertState.offset += f.invertState.offsetAttribute.EndOffset()
   643  
   644  		// if there is an error coming through, we don't set this to true here:
   645  		succeededInProcessingField = true
   646  		return nil
   647  	}(); err != nil {
   648  		return err
   649  	}
   650  
   651  	if analyzed {
   652  		f.invertState.position += f.docState.analyzer.PositionIncrementGap(f.fieldInfo.Name)
   653  		f.invertState.offset += f.docState.analyzer.OffsetGap(f.fieldInfo.Name)
   654  	}
   655  
   656  	f.invertState.boost *= field.Boost()
   657  	return nil
   658  }