github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/docConsumer.go

package index

import (
	"github.com/balzaczyy/golucene/core/index/model"
	// "github.com/balzaczyy/golucene/core/store"
	// "github.com/balzaczyy/golucene/core/util"
)

// index/DocConsumer.java

// DocConsumer consumes documents for an in-progress segment:
// processDocument is called once per added document, flush persists the
// accumulated state when the segment is written, and abort discards any
// partial state on failure.
type DocConsumer interface {
	processDocument() error
	flush(state *model.SegmentWriteState) error
	abort()
}
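
// A minimal sketch of a no-op implementation, kept commented like the
// rest of this file (illustrative only, not part of the port), showing
// the lifecycle an indexing chain drives:
//
// type noOpDocConsumer struct{}
//
// func (c *noOpDocConsumer) processDocument() error { return nil }
//
// func (c *noOpDocConsumer) flush(state *model.SegmentWriteState) error { return nil }
//
// func (c *noOpDocConsumer) abort() {}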

// // index/DocFieldProcessor.java

// /*
// This is a DocConsumer that gathers all fields under the same name,
// and calls per-field consumers to process field by field. This class
// doesn't do any "real" work of its own: it just forwards the fields to
// a DocFieldConsumer.
// */
// type DocFieldProcessor struct {
// 	consumer       DocFieldConsumer
// 	storedConsumer StoredFieldsConsumer
// 	codec          Codec

// 	// Holds all fields seen in current doc
// 	_fields    []*DocFieldProcessorPerField
// 	fieldCount int

// 	// Hash table for all fields ever seen
// 	fieldHash       []*DocFieldProcessorPerField
// 	hashMask        int
// 	totalFieldCount int

// 	fieldGen int

// 	docState *docState

// 	bytesUsed util.Counter
// }

// func newDocFieldProcessor(docWriter *DocumentsWriterPerThread,
// 	consumer DocFieldConsumer, storedConsumer StoredFieldsConsumer) *DocFieldProcessor {

// 	assert(storedConsumer != nil)
// 	return &DocFieldProcessor{
// 		_fields:        make([]*DocFieldProcessorPerField, 1),
// 		fieldHash:      make([]*DocFieldProcessorPerField, 2),
// 		hashMask:       1,
// 		docState:       docWriter.docState,
// 		codec:          docWriter.codec,
// 		bytesUsed:      docWriter._bytesUsed,
// 		consumer:       consumer,
// 		storedConsumer: storedConsumer,
// 	}
// }

// func (p *DocFieldProcessor) flush(state *model.SegmentWriteState) error {
// 	childFields := make(map[string]DocFieldConsumerPerField)
// 	for _, f := range p.fields() {
// 		childFields[f.fieldInfo().Name] = f
// 	}

// 	err := p.storedConsumer.flush(state)
// 	if err != nil {
// 		return err
// 	}
// 	err = p.consumer.flush(childFields, state)
// 	if err != nil {
// 		return err
// 	}

// 	// Important to save after asking consumer to flush so consumer can
// 	// alter the FieldInfo if necessary. E.g., FreqProxTermsWriter does
// 	// this with FieldInfo.storePayload.
// 	infosWriter := p.codec.FieldInfosFormat().FieldInfosWriter()
// 	assert(infosWriter != nil)
// 	return infosWriter(state.Directory, state.SegmentInfo.Name,
// 		state.FieldInfos, store.IO_CONTEXT_DEFAULT)
// }

// func (p *DocFieldProcessor) abort() {
// 	for _, field := range p.fieldHash {
// 		for field != nil {
// 			next := field.next
// 			field.abort()
// 			field = next
// 		}
// 	}
// 	p.storedConsumer.abort()
// 	p.consumer.abort()
// 	// assert2(err == nil, err.Error())
// }

// func (p *DocFieldProcessor) fields() []DocFieldConsumerPerField {
// 	var fields []DocFieldConsumerPerField
// 	for _, field := range p.fieldHash {
// 		for field != nil {
// 			fields = append(fields, field.consumer)
// 			field = field.next
// 		}
// 	}
// 	assert(len(fields) == p.totalFieldCount)
// 	return fields
// }

// func (p *DocFieldProcessor) rehash() {
// 	newHashSize := len(p.fieldHash) * 2
// 	assert(newHashSize > len(p.fieldHash)) // avoid overflow

// 	newHashArray := make([]*DocFieldProcessorPerField, newHashSize)

// 	// Rehash
// 	newHashMask := newHashSize - 1
// 	for _, fp0 := range p.fieldHash {
// 		for fp0 != nil {
// 			hashPos2 := hashstr(fp0.fieldInfo.Name) & newHashMask
// 			nextFP0 := fp0.next
// 			fp0.next = newHashArray[hashPos2]
// 			newHashArray[hashPos2] = fp0
// 			fp0 = nextFP0
// 		}
// 	}

// 	p.fieldHash = newHashArray
// 	p.hashMask = newHashMask
// }
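
// The table stays a power of two so the cheap "hash & mask" can replace
// "hash % size". A standalone sketch of the same doubling-and-relink
// pattern, using hypothetical node and hash names (assumptions, not the
// port's API):
//
// type node struct {
// 	key  string
// 	next *node
// }
//
// func rehash(table []*node, hash func(string) int) []*node {
// 	bigger := make([]*node, len(table)*2)
// 	mask := len(bigger) - 1
// 	for _, n := range table {
// 		for n != nil {
// 			next := n.next // detach before relinking
// 			slot := hash(n.key) & mask
// 			n.next = bigger[slot] // push onto the head of the new chain
// 			bigger[slot] = n
// 			n = next
// 		}
// 	}
// 	return bigger
// }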

// func (p *DocFieldProcessor) processDocument(fieldInfos *model.FieldInfosBuilder) error {
// 	p.consumer.startDocument()
// 	p.storedConsumer.startDocument()

// 	p.fieldCount = 0

// 	thisFieldGen := p.fieldGen
// 	p.fieldGen++

// 	// Absorb any new fields first seen in this document. Also absorb
// 	// any changes to fields we had already seen before (e.g. suddenly
// 	// turning on norms or vectors, etc.)

// 	for _, field := range p.docState.doc {
// 		fieldName := field.Name()

// 		// Make sure we have a PerField allocated
// 		hashPos := hashstr(fieldName) & p.hashMask
// 		fp := p.fieldHash[hashPos]
// 		for fp != nil && fp.fieldInfo.Name != fieldName {
// 			fp = fp.next
// 		}

// 		if fp == nil {
// 			// TODO FI: we need to genericize the "flags" that a field
// 			// holds, and, how these flags are merged; it needs to be more
// 			// "pluggable" such that if I want to have a new "thing" my
// 			// Fields can do, I can easily add it
// 			fi := fieldInfos.AddOrUpdate(fieldName, field.FieldType())

// 			fp = newDocFieldProcessorPerField(p, fi)
// 			fp.next = p.fieldHash[hashPos]
// 			p.fieldHash[hashPos] = fp
// 			p.totalFieldCount++

// 			if p.totalFieldCount >= len(p.fieldHash)/2 {
// 				p.rehash()
// 			}
// 		} else {
// 			panic("not implemented yet")
// 		}

// 		if thisFieldGen != fp.lastGen {
// 			// First time we're seeing this field for this doc
// 			fp.fieldCount = 0

// 			if p.fieldCount == len(p._fields) {
// 				newSize := len(p._fields) * 2
// 				newArray := make([]*DocFieldProcessorPerField, newSize)
// 				copy(newArray, p._fields[:p.fieldCount])
// 				p._fields = newArray
// 			}

// 			p._fields[p.fieldCount] = fp
// 			p.fieldCount++
// 			fp.lastGen = thisFieldGen
// 		}

// 		fp.addField(field)
// 		p.storedConsumer.addField(p.docState.docID, field, fp.fieldInfo)
// 	}

// 	// If we are writing vectors then we must visit fields in sorted
// 	// order so they are written in sorted order. TODO: we actually
// 	// only need to sort the subset of fields that have vectors enabled;
// 	// we could save [small amount of] CPU here.
// 	util.IntroSort(ByNameDocFieldProcessorPerFields(p._fields[:p.fieldCount]))
// 	for _, perField := range p._fields[:p.fieldCount] {
// 		err := perField.consumer.processFields(perField.fields, perField.fieldCount)
// 		if err != nil {
// 			return err
// 		}
// 	}

// 	if prefix, is := p.docState.maxTermPrefix, p.docState.infoStream; prefix != "" && is.IsEnabled("IW") {
// 		is.Message("IW",
// 			"WARNING: document contains at least one immense term (whose UTF8 encoding is longer than the max length %v), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '%v...'",
// 			MAX_TERM_LENGTH_UTF8,
// 			prefix)
// 		p.docState.maxTermPrefix = ""
// 	}

// 	return nil
// }
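
// The fieldGen/lastGen check above is a lazy reset: instead of clearing
// every per-field slot at the start of each document, processDocument
// bumps one generation counter and treats any entry whose lastGen lags
// behind as empty. A minimal sketch of the idea with hypothetical names
// (an assumption, not the port's API):
//
// type perField struct {
// 	lastGen int // generation of the last doc that touched this field
// 	count   int // occurrences within that doc
// }
//
// func touch(f *perField, gen int) {
// 	if f.lastGen != gen {
// 		f.count = 0 // first sighting in this doc: reset lazily
// 		f.lastGen = gen
// 	}
// 	f.count++
// }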

// type ByNameDocFieldProcessorPerFields []*DocFieldProcessorPerField

// func (a ByNameDocFieldProcessorPerFields) Len() int      { return len(a) }
// func (a ByNameDocFieldProcessorPerFields) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// func (a ByNameDocFieldProcessorPerFields) Less(i, j int) bool {
// 	return a[i].fieldInfo.Name < a[j].fieldInfo.Name
// }
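
// ByNameDocFieldProcessorPerFields satisfies sort.Interface, so the
// standard library could produce the same by-name ordering as the
// util.IntroSort call above (an illustrative alternative, not what the
// port uses):
//
// sort.Sort(ByNameDocFieldProcessorPerFields(fields))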

// func (p *DocFieldProcessor) finishDocument() (err error) {
// 	defer func() {
// 		err = mergeError(err, p.consumer.finishDocument())
// 	}()
// 	return p.storedConsumer.finishDocument()
// }
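
// The named return plus defer above guarantees the field consumer's
// finishDocument runs even when the stored-fields consumer fails, with
// both errors combined via mergeError. A standalone sketch of the
// pattern with hypothetical names (an assumption, not the port's API):
//
// func finishBoth(first, second func() error) (err error) {
// 	defer func() {
// 		if e := second(); e != nil && err == nil {
// 			err = e // the second step always runs; the first error wins
// 		}
// 	}()
// 	return first()
// }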

// // index/DocFieldProcessorPerField.java

// /* Holds all per-thread, per-field state. */
// type DocFieldProcessorPerField struct {
// 	consumer  DocFieldConsumerPerField
// 	fieldInfo *model.FieldInfo

// 	next    *DocFieldProcessorPerField
// 	lastGen int // -1

// 	fieldCount int
// 	fields     []model.IndexableField
// }

// func newDocFieldProcessorPerField(docFieldProcessor *DocFieldProcessor,
// 	fieldInfo *model.FieldInfo) *DocFieldProcessorPerField {
// 	return &DocFieldProcessorPerField{
// 		consumer:  docFieldProcessor.consumer.addField(fieldInfo),
// 		lastGen:   -1,
// 		fieldInfo: fieldInfo,
// 	}
// }

// func (f *DocFieldProcessorPerField) addField(field model.IndexableField) {
// 	if f.fieldCount == len(f.fields) {
// 		newSize := util.Oversize(f.fieldCount+1, util.NUM_BYTES_OBJECT_REF)
// 		newArray := make([]model.IndexableField, newSize)
// 		copy(newArray, f.fields)
// 		f.fields = newArray
// 	}
// 	f.fields[f.fieldCount] = field
// 	f.fieldCount++
// }
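
// addField grows its backing array by hand; util.Oversize mirrors
// Lucene's ArrayUtil.oversize amortized-growth policy. In idiomatic Go
// the built-in append gives the same amortized doubling, as in this
// sketch of an equivalent method (illustrative only, not what the port
// compiles):
//
// func (f *DocFieldProcessorPerField) addField(field model.IndexableField) {
// 	f.fields = append(f.fields, field) // append grows capacity as needed
// 	f.fieldCount = len(f.fields)
// }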

// func (f *DocFieldProcessorPerField) abort() {
// 	f.consumer.abort()
// }