github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/termsHashConsumer.go (about) 1 package index 2 3 import ( 4 . "github.com/balzaczyy/golucene/core/codec/spi" 5 "github.com/balzaczyy/golucene/core/index/model" 6 // "github.com/balzaczyy/golucene/core/store" 7 "github.com/balzaczyy/golucene/core/util" 8 ) 9 10 // index/TermsHashConsumer.java 11 12 // type TermsHashConsumer interface { 13 // flush(map[string]TermsHashConsumerPerField, *model.SegmentWriteState) error 14 // abort() 15 // startDocument() 16 // finishDocument(*TermsHash) error 17 // addField(*TermsHashPerField, *model.FieldInfo) TermsHashConsumerPerField 18 // } 19 20 // index/TermVectorsConsumer.java 21 22 type TermVectorsConsumer struct { 23 *TermsHashImpl 24 25 writer TermVectorsWriter 26 27 docWriter *DocumentsWriterPerThread 28 29 hasVectors bool 30 numVectorsFields int 31 lastDocId int 32 perFields []*TermVectorsConsumerPerField 33 } 34 35 func newTermVectorsConsumer(docWriter *DocumentsWriterPerThread) *TermVectorsConsumer { 36 ans := &TermVectorsConsumer{ 37 docWriter: docWriter, 38 } 39 ans.TermsHashImpl = newTermsHash(ans, docWriter, false, nil) 40 return ans 41 } 42 43 func (tvc *TermVectorsConsumer) flush(fieldsToFlush map[string]TermsHashPerField, 44 state *model.SegmentWriteState) (err error) { 45 if tvc.writer != nil { 46 numDocs := state.SegmentInfo.DocCount() 47 assert(numDocs > 0) 48 // At least one doc in this run had term vectors enabled 49 func() { 50 defer func() { 51 err = mergeError(err, util.Close(tvc.writer)) 52 tvc.writer = nil 53 tvc.lastDocId = 0 54 tvc.hasVectors = false 55 }() 56 57 err = tvc.fill(numDocs) 58 if err == nil { 59 err = tvc.writer.Finish(state.FieldInfos, numDocs) 60 } 61 }() 62 if err != nil { 63 return err 64 } 65 } 66 67 return 68 } 69 70 /* 71 Fills in no-term-vectors for all docs we haven't seen since the last 72 doc that had term vectors. 73 */ 74 func (c *TermVectorsConsumer) fill(docId int) error { 75 for c.lastDocId < docId { 76 c.writer.StartDocument(0) 77 err := c.writer.FinishDocument() 78 if err != nil { 79 return err 80 } 81 c.lastDocId++ 82 } 83 return nil 84 } 85 86 func (c *TermVectorsConsumer) initTermVectorsWriter() error { 87 if c.writer == nil { 88 panic("not implemented yet") 89 } 90 return nil 91 } 92 93 type TermVectorsConsumerPerFields []*TermVectorsConsumerPerField 94 95 func (a TermVectorsConsumerPerFields) Len() int { return len(a) } 96 func (a TermVectorsConsumerPerFields) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 97 func (a TermVectorsConsumerPerFields) Less(i, j int) bool { 98 return a[i].fieldInfo.Name < a[j].fieldInfo.Name 99 } 100 101 func (c *TermVectorsConsumer) finishDocument() (err error) { 102 c.docWriter.testPoint("TermVectorsTermsWriter.finishDocument start") 103 104 if !c.hasVectors { 105 return 106 } 107 108 // Fields in term vectors are UTF16 sorted: (?) 109 util.IntroSort(TermVectorsConsumerPerFields(c.perFields[:c.numVectorsFields])) 110 111 if err = c.initTermVectorsWriter(); err != nil { 112 return 113 } 114 115 if err = c.fill(c.docState.docID); err != nil { 116 return 117 } 118 119 // Append term vectors to the real outputs: 120 if err = c.writer.StartDocument(c.numVectorsFields); err != nil { 121 return 122 } 123 for i := 0; i < c.numVectorsFields; i++ { 124 if err = c.perFields[i].finishDocument(); err != nil { 125 return 126 } 127 } 128 if err = c.writer.FinishDocument(); err != nil { 129 return 130 } 131 132 assert2(c.lastDocId == c.docState.docID, 133 "lastDocID=%v docState.docID=%v", 134 c.lastDocId, c.docState.docID) 135 136 c.lastDocId++ 137 138 c.TermsHashImpl.reset() 139 c.resetFields() 140 c.docWriter.testPoint("TermVectorsTermsWriter.finishDocument end") 141 return 142 } 143 144 func (tvc *TermVectorsConsumer) abort() { 145 tvc.hasVectors = false 146 147 defer func() { 148 if tvc.writer != nil { 149 tvc.writer.Abort() 150 tvc.writer = nil 151 } 152 153 tvc.lastDocId = 0 154 tvc.reset() 155 }() 156 157 tvc.TermsHashImpl.abort() 158 } 159 160 func (tvc *TermVectorsConsumer) resetFields() { 161 tvc.perFields = nil 162 tvc.numVectorsFields = 0 163 } 164 165 func (tvc *TermVectorsConsumer) addField(invertState *FieldInvertState, 166 fieldInfo *model.FieldInfo) TermsHashPerField { 167 return newTermVectorsConsumerPerField(invertState, tvc, fieldInfo) 168 } 169 170 func (c *TermVectorsConsumer) addFieldToFlush(fieldToFlush *TermVectorsConsumerPerField) { 171 panic("not implemented yet") 172 } 173 174 func (c *TermVectorsConsumer) startDocument() { 175 c.resetFields() 176 c.numVectorsFields = 0 177 } 178 179 // func (c *TermVectorsConsumer) clearLastVectorFieldName() bool { 180 // c.lastVectorFieldName = "" 181 // return true 182 // } 183 184 // index/FreqProxTermsWriter.java 185 186 type FreqProxTermsWriter struct { 187 *TermsHashImpl 188 } 189 190 func newFreqProxTermsWriter(docWriter *DocumentsWriterPerThread, termVectors TermsHash) *FreqProxTermsWriter { 191 ans := &FreqProxTermsWriter{} 192 ans.TermsHashImpl = newTermsHash(ans, docWriter, true, termVectors) 193 return ans 194 } 195 196 func (w *FreqProxTermsWriter) flush(fieldsToFlush map[string]TermsHashPerField, 197 state *model.SegmentWriteState) (err error) { 198 199 if err = w.TermsHashImpl.flush(fieldsToFlush, state); err != nil { 200 return 201 } 202 203 // Gather all FieldData's that have postings, across all ThreadStates 204 var allFields []*FreqProxTermsWriterPerField 205 206 for _, f := range fieldsToFlush { 207 if perField := f.(*FreqProxTermsWriterPerField); perField.bytesHash.Size() > 0 { 208 allFields = append(allFields, perField) 209 } 210 } 211 212 // Sort by field name 213 util.IntroSort(FreqProxTermsWriterPerFields(allFields)) 214 215 var consumer FieldsConsumer 216 if consumer, err = state.SegmentInfo.Codec().(Codec).PostingsFormat().FieldsConsumer(state); err != nil { 217 return 218 } 219 220 var success = false 221 defer func() { 222 if success { 223 err = util.Close(consumer) 224 } else { 225 util.CloseWhileSuppressingError(consumer) 226 } 227 }() 228 229 var termsHash TermsHash 230 // Current writer chain: 231 // FieldsConsumer 232 // -> IMPL: FormatPostingsTermsDictWriter 233 // -> TermsConsumer 234 // -> IMPL: FormatPostingsTermsDictWriter.TermsWriter 235 // -> DocsConsumer 236 // -> IMPL: FormatPostingsDocWriter 237 // -> PositionsConsumer 238 // -> IMPL: FormatPostingsPositionsWriter 239 240 for _, fieldWriter := range allFields { 241 fieldInfo := fieldWriter.fieldInfo 242 243 // If this field has postings then add them to the segment 244 if err = fieldWriter.flush(fieldInfo.Name, consumer, state); err != nil { 245 return 246 } 247 248 assert(termsHash == nil || termsHash == fieldWriter.termsHash) 249 termsHash = fieldWriter.termsHash 250 fieldWriter.reset() 251 } 252 253 if termsHash != nil { 254 termsHash.reset() 255 } 256 success = true 257 return nil 258 } 259 260 type FreqProxTermsWriterPerFields []*FreqProxTermsWriterPerField 261 262 func (a FreqProxTermsWriterPerFields) Len() int { return len(a) } 263 func (a FreqProxTermsWriterPerFields) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 264 func (a FreqProxTermsWriterPerFields) Less(i, j int) bool { 265 return a[i].fieldInfo.Name < a[j].fieldInfo.Name 266 } 267 268 func (w *FreqProxTermsWriter) addField(invertState *FieldInvertState, 269 fieldInfo *model.FieldInfo) TermsHashPerField { 270 271 return newFreqProxTermsWriterPerField(invertState, w, fieldInfo, 272 w.nextTermsHash.addField(invertState, fieldInfo)) 273 }