github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/lucene41/postingsWriter.go (about) 1 package lucene41 2 3 import ( 4 "errors" 5 "fmt" 6 "github.com/balzaczyy/golucene/core/codec" 7 . "github.com/balzaczyy/golucene/core/codec/spi" 8 . "github.com/balzaczyy/golucene/core/index/model" 9 "github.com/balzaczyy/golucene/core/store" 10 "github.com/balzaczyy/golucene/core/util" 11 "github.com/balzaczyy/golucene/core/util/packed" 12 "reflect" 13 ) 14 15 // Lucene41PostingsWriter.java 16 17 /* 18 Expert: the maximum number of skip levels. Smaller values result in 19 slightly smaller indexes, but slower skipping in big posting lists. 20 */ 21 const maxSkipLevels = 10 22 23 const ( 24 LUCENE41_TERMS_CODEC = "Lucene41PostingsWriterTerms" 25 LUCENE41_DOC_CODEC = "Lucene41PostingsWriterDoc" 26 LUCENE41_POS_CODEC = "Lucene41PostingsWriterPos" 27 LUCENE41_PAY_CODEC = "Lucene41PostingsWriterPay" 28 29 LUCENE41_VERSION_START = 0 30 LUCENE41_VERSION_META_ARRAY = 1 31 LUCENE41_VERSION_CHECKSUM = 2 32 LUCENE41_VERSION_CURRENT = LUCENE41_VERSION_CHECKSUM 33 ) 34 35 /* 36 Concrete class that writes docId (maybe frq,pos,offset,payloads) list 37 with postings format. 38 39 Postings list for each term will be stored separately. 40 */ 41 type Lucene41PostingsWriter struct { 42 docOut store.IndexOutput 43 posOut store.IndexOutput 44 payOut store.IndexOutput 45 46 lastState *intBlockTermState 47 48 fieldHasFreqs bool 49 fieldHasPositions bool 50 fieldHasOffsets bool 51 fieldHasPayloads bool 52 53 // Holds starting file pointers for current term: 54 docStartFP int64 55 posStartFP int64 56 payStartFP int64 57 58 docDeltaBuffer []int 59 freqBuffer []int 60 docBufferUpto int 61 62 posDeltaBuffer []int 63 payloadLengthBuffer []int 64 offsetStartDeltaBuffer []int 65 offsetLengthBuffer []int 66 posBufferUpto int 67 68 payloadBytes []byte 69 payloadByteUpto int 70 71 lastBlockDocId int 72 lastBlockPosFP int64 73 lastBlockPayFP int64 74 lastBlockPosBufferUpto int 75 lastBlockPayloadByteUpto int 76 77 lastDocId int 78 lastPosition int 79 lastStartOffset int 80 docCount int 81 82 encoded []byte 83 84 forUtil *ForUtil 85 skipWriter *SkipWriter 86 } 87 88 /* Creates a postings writer with the specified PackedInts overhead ratio */ 89 func newLucene41PostingsWriter(state *SegmentWriteState, 90 accetableOverheadRatio float32) (*Lucene41PostingsWriter, error) { 91 docOut, err := state.Directory.CreateOutput( 92 util.SegmentFileName(state.SegmentInfo.Name, 93 state.SegmentSuffix, 94 LUCENE41_DOC_EXTENSION), 95 state.Context) 96 if err != nil { 97 return nil, err 98 } 99 100 ans := new(Lucene41PostingsWriter) 101 if err = func() error { 102 var posOut store.IndexOutput 103 var payOut store.IndexOutput 104 var success = false 105 defer func() { 106 if !success { 107 util.CloseWhileSuppressingError(docOut, posOut, payOut) 108 } 109 }() 110 111 err := codec.WriteHeader(docOut, LUCENE41_DOC_CODEC, LUCENE41_VERSION_CURRENT) 112 if err != nil { 113 return err 114 } 115 ans.forUtil, err = NewForUtilInto(accetableOverheadRatio, docOut) 116 if err != nil { 117 return err 118 } 119 if state.FieldInfos.HasProx { 120 ans.posDeltaBuffer = make([]int, MAX_DATA_SIZE) 121 posOut, err = state.Directory.CreateOutput(util.SegmentFileName( 122 state.SegmentInfo.Name, state.SegmentSuffix, LUCENE41_POS_EXTENSION), 123 state.Context) 124 if err != nil { 125 return err 126 } 127 128 err = codec.WriteHeader(posOut, LUCENE41_POS_CODEC, LUCENE41_VERSION_CURRENT) 129 if err != nil { 130 return err 131 } 132 133 if state.FieldInfos.HasPayloads { 134 ans.payloadBytes = make([]byte, 128) 135 ans.payloadLengthBuffer = make([]int, MAX_DATA_SIZE) 136 } 137 138 if state.FieldInfos.HasOffsets { 139 ans.offsetStartDeltaBuffer = make([]int, MAX_DATA_SIZE) 140 ans.offsetLengthBuffer = make([]int, MAX_DATA_SIZE) 141 } 142 143 if state.FieldInfos.HasPayloads || state.FieldInfos.HasOffsets { 144 payOut, err = state.Directory.CreateOutput(util.SegmentFileName( 145 state.SegmentInfo.Name, state.SegmentSuffix, LUCENE41_PAY_EXTENSION), 146 state.Context) 147 if err != nil { 148 return err 149 } 150 err = codec.WriteHeader(payOut, LUCENE41_PAY_CODEC, LUCENE41_VERSION_CURRENT) 151 } 152 } 153 ans.payOut, ans.posOut = payOut, posOut 154 ans.docOut = docOut 155 success = true 156 return nil 157 }(); err != nil { 158 return nil, err 159 } 160 161 ans.docDeltaBuffer = make([]int, MAX_DATA_SIZE) 162 ans.freqBuffer = make([]int, MAX_DATA_SIZE) 163 ans.encoded = make([]byte, MAX_ENCODED_SIZE) 164 165 // TODO: should we try skipping every 2/4 blocks...? 166 ans.skipWriter = NewSkipWriter( 167 maxSkipLevels, 168 LUCENE41_BLOCK_SIZE, 169 state.SegmentInfo.DocCount(), 170 ans.docOut, 171 ans.posOut, 172 ans.payOut) 173 174 return ans, nil 175 } 176 177 /* Creates a postings writer with PackedInts.COMPACT */ 178 func newLucene41PostingsWriterCompact(state *SegmentWriteState) (*Lucene41PostingsWriter, error) { 179 return newLucene41PostingsWriter(state, packed.PackedInts.COMPACT) 180 } 181 182 type intBlockTermState struct { 183 *BlockTermState 184 docStartFP int64 185 posStartFP int64 186 payStartFP int64 187 skipOffset int64 188 lastPosBlockOffset int64 189 // docid when there is a single pulsed posting, otherwise -1 190 // freq is always implicitly totalTermFreq in this case. 191 singletonDocID int 192 } 193 194 var emptyState = newIntBlockTermState() 195 196 func newIntBlockTermState() *intBlockTermState { 197 ts := &intBlockTermState{ 198 skipOffset: -1, 199 lastPosBlockOffset: -1, 200 singletonDocID: -1, 201 } 202 parent := NewBlockTermState() 203 ts.BlockTermState, parent.Self = parent, ts 204 return ts 205 } 206 207 func (ts *intBlockTermState) Clone() TermState { 208 clone := newIntBlockTermState() 209 clone.CopyFrom(ts) 210 return clone 211 } 212 213 func (ts *intBlockTermState) CopyFrom(other TermState) { 214 assert(other != nil) 215 if ots, ok := other.(*intBlockTermState); ok { 216 ts.BlockTermState.CopyFrom_(ots.BlockTermState) 217 ts.docStartFP = ots.docStartFP 218 ts.posStartFP = ots.posStartFP 219 ts.payStartFP = ots.payStartFP 220 ts.lastPosBlockOffset = ots.lastPosBlockOffset 221 ts.skipOffset = ots.skipOffset 222 ts.singletonDocID = ots.singletonDocID 223 } else { 224 panic(fmt.Sprintf("Can not copy from %v", reflect.TypeOf(other).Name())) 225 } 226 } 227 228 func (ts *intBlockTermState) String() string { 229 return fmt.Sprintf("%v docStartFP=%v posStartFP=%v payStartFP=%v lastPosBlockOffset=%v skipOffset=%v singletonDocID=%v", 230 ts.BlockTermState, ts.docStartFP, ts.posStartFP, ts.payStartFP, ts.lastPosBlockOffset, ts.skipOffset, ts.singletonDocID) 231 } 232 233 func (w *Lucene41PostingsWriter) NewTermState() *BlockTermState { 234 return newIntBlockTermState().BlockTermState 235 } 236 237 func (w *Lucene41PostingsWriter) Init(termsOut store.IndexOutput) error { 238 err := codec.WriteHeader(termsOut, LUCENE41_TERMS_CODEC, LUCENE41_VERSION_CURRENT) 239 if err == nil { 240 err = termsOut.WriteVInt(LUCENE41_BLOCK_SIZE) 241 } 242 return err 243 } 244 245 func (w *Lucene41PostingsWriter) SetField(fieldInfo *FieldInfo) int { 246 n := int(fieldInfo.IndexOptions()) 247 w.fieldHasFreqs = n >= int(INDEX_OPT_DOCS_AND_FREQS) 248 w.fieldHasPositions = n >= int(INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS) 249 w.fieldHasOffsets = n >= int(INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) 250 w.fieldHasPayloads = fieldInfo.HasPayloads() 251 w.skipWriter.SetField(w.fieldHasPositions, w.fieldHasOffsets, w.fieldHasPayloads) 252 w.lastState = emptyState 253 if w.fieldHasPositions { 254 if w.fieldHasPayloads || w.fieldHasOffsets { 255 return 3 // doc + pos + pay FP 256 } else { 257 return 2 // doc + pos FP 258 } 259 } else { 260 return 1 // doc FP 261 } 262 } 263 264 func (w *Lucene41PostingsWriter) StartTerm() error { 265 w.docStartFP = w.docOut.FilePointer() 266 if w.fieldHasPositions { 267 w.posStartFP = w.posOut.FilePointer() 268 if w.fieldHasPayloads || w.fieldHasOffsets { 269 w.payStartFP = w.payOut.FilePointer() 270 } 271 } 272 w.lastDocId = 0 273 w.lastBlockDocId = -1 274 w.skipWriter.ResetSkip() 275 return nil 276 } 277 278 func (w *Lucene41PostingsWriter) StartDoc(docId, termDocFreq int) error { 279 // Have collected a block of docs, and get a new doc. Should write 280 // skip data as well as postings list for current block. 281 if w.lastBlockDocId != -1 && w.docBufferUpto == 0 { 282 if err := w.skipWriter.BufferSkip(w.lastBlockDocId, w.docCount, 283 w.lastBlockPosFP, w.lastBlockPayFP, w.lastBlockPosBufferUpto, 284 w.lastBlockPayloadByteUpto); err != nil { 285 return err 286 } 287 } 288 289 docDelta := docId - w.lastDocId 290 if docId < 0 || (w.docCount > 0 && docDelta <= 0) { 291 return errors.New(fmt.Sprintf( 292 "docs out of order (%v <= %v) (docOut : %v)", 293 docId, w.lastDocId, w.docOut)) 294 } 295 w.docDeltaBuffer[w.docBufferUpto] = docDelta 296 if w.fieldHasFreqs { 297 w.freqBuffer[w.docBufferUpto] = termDocFreq 298 } 299 w.docBufferUpto++ 300 w.docCount++ 301 302 if w.docBufferUpto == LUCENE41_BLOCK_SIZE { 303 if err := w.forUtil.writeBlock(w.docDeltaBuffer, w.encoded, w.docOut); err != nil { 304 return err 305 } 306 if w.fieldHasFreqs { 307 if err := w.forUtil.writeBlock(w.freqBuffer, w.encoded, w.docOut); err != nil { 308 return err 309 } 310 } 311 // NOTE: don't set docBufferUpto back to 0 here; finishDoc will 312 // do so (because it needs to see that the block was filled so it 313 // can save skip data) 314 } 315 316 w.lastDocId = docId 317 w.lastPosition = 0 318 w.lastStartOffset = 0 319 return nil 320 } 321 322 /* Add a new opsition & payload */ 323 func (w *Lucene41PostingsWriter) AddPosition(position int, 324 payload []byte, startOffset, endOffset int) error { 325 326 w.posDeltaBuffer[w.posBufferUpto] = position - w.lastPosition 327 if w.fieldHasPayloads { 328 if len(payload) == 0 { 329 // no paylaod 330 w.payloadLengthBuffer[w.posBufferUpto] = 0 331 } else { 332 panic("not implemented yet") 333 } 334 } 335 336 if w.fieldHasOffsets { 337 panic("not implemented yet") 338 } 339 340 w.posBufferUpto++ 341 w.lastPosition = position 342 if w.posBufferUpto == LUCENE41_BLOCK_SIZE { 343 var err error 344 if err = w.forUtil.writeBlock(w.posDeltaBuffer, w.encoded, w.posOut); err != nil { 345 return err 346 } 347 348 if w.fieldHasPayloads { 349 panic("niy") 350 } 351 if w.fieldHasOffsets { 352 panic("niy") 353 } 354 w.posBufferUpto = 0 355 } 356 return nil 357 } 358 359 func (w *Lucene41PostingsWriter) FinishDoc() error { 360 // since we don't know df for current term, we had to buffer those 361 // skip data for each block, and when a new doc comes, write them 362 // to skip file. 363 if w.docBufferUpto == LUCENE41_BLOCK_SIZE { 364 w.lastBlockDocId = w.lastDocId 365 if w.posOut != nil { 366 if w.payOut != nil { 367 w.lastBlockPayFP = w.payOut.FilePointer() 368 } 369 w.lastBlockPosFP = w.posOut.FilePointer() 370 w.lastBlockPosBufferUpto = w.posBufferUpto 371 w.lastBlockPayloadByteUpto = w.payloadByteUpto 372 } 373 w.docBufferUpto = 0 374 } 375 return nil 376 } 377 378 /* Called when we are done adding docs to this term */ 379 func (w *Lucene41PostingsWriter) FinishTerm(_state *BlockTermState) error { 380 state := _state.Self.(*intBlockTermState) 381 assert(state.DocFreq > 0) 382 383 // TODO: wasteful we are couting this (counting # docs for this term) in two places? 384 assert2(state.DocFreq == w.docCount, "%v vs %v", state.DocFreq, w.docCount) 385 386 // docFreq == 1, don't write the single docId/freq to a separate 387 // file along with a pointer to it. 388 var singletonDocId int 389 if state.DocFreq == 1 { 390 // pulse the singleton docId into the term dictionary, freq is implicitly totalTermFreq 391 singletonDocId = w.docDeltaBuffer[0] 392 } else { 393 singletonDocId = -1 394 // vInt encode the remaining doc dealtas and freqs; 395 var err error 396 for i := 0; i < w.docBufferUpto; i++ { 397 docDelta := w.docDeltaBuffer[i] 398 freq := w.freqBuffer[i] 399 if !w.fieldHasFreqs { 400 if err = w.docOut.WriteVInt(int32(docDelta)); err != nil { 401 return err 402 } 403 } else if w.freqBuffer[i] == 1 { 404 if err = w.docOut.WriteVInt(int32((docDelta << 1) | 1)); err != nil { 405 return err 406 } 407 } else { 408 if err = w.docOut.WriteVInt(int32(docDelta << 1)); err != nil { 409 return err 410 } 411 if err = w.docOut.WriteVInt(int32(freq)); err != nil { 412 return err 413 } 414 } 415 } 416 } 417 418 var lastPosBlockOffset int64 419 if w.fieldHasPositions { 420 // totalTermFreq is just total number of positions (or payloads, 421 // or offsets) associated with current term. 422 assert(state.TotalTermFreq != -1) 423 if state.TotalTermFreq > LUCENE41_BLOCK_SIZE { 424 // record file offset for last pos in last block 425 lastPosBlockOffset = w.posOut.FilePointer() - w.posStartFP 426 } else { 427 lastPosBlockOffset = -1 428 } 429 if w.posBufferUpto > 0 { 430 // TODO: should we send offsets/payloads to .pay...? seems 431 // wasteful (have to store extra vlong for low (< BLOCK_SIZE) 432 // DF terms = vast vast majority) 433 434 // vInt encode the remaining positions/payloads/offsets: 435 // lastPayloadLength := -1 // force first payload length to be written 436 // lastOffsetLength := -1 // force first offset length to be written 437 payloadBytesReadUpto := 0 438 for i := 0; i < w.posBufferUpto; i++ { 439 posDelta := w.posDeltaBuffer[i] 440 if w.fieldHasPayloads { 441 panic("not implemented yet") 442 } else { 443 err := w.posOut.WriteVInt(int32(posDelta)) 444 if err != nil { 445 return err 446 } 447 } 448 449 if w.fieldHasOffsets { 450 panic("not implemented yet") 451 } 452 } 453 454 if w.fieldHasPayloads { 455 assert(payloadBytesReadUpto == w.payloadByteUpto) 456 w.payloadByteUpto = 0 457 } 458 } 459 } else { 460 lastPosBlockOffset = -1 461 } 462 463 var skipOffset int64 464 if w.docCount > LUCENE41_BLOCK_SIZE { 465 n, err := w.skipWriter.WriteSkip(w.docOut) 466 if err != nil { 467 return err 468 } 469 skipOffset = n - w.docStartFP 470 } else { 471 skipOffset = -1 472 } 473 474 state.docStartFP = w.docStartFP 475 state.posStartFP = w.posStartFP 476 state.payStartFP = w.payStartFP 477 state.singletonDocID = singletonDocId 478 state.skipOffset = skipOffset 479 state.lastPosBlockOffset = lastPosBlockOffset 480 w.docBufferUpto = 0 481 w.posBufferUpto = 0 482 w.lastDocId = 0 483 w.docCount = 0 484 return nil 485 } 486 487 func (w *Lucene41PostingsWriter) EncodeTerm(longs []int64, 488 out util.DataOutput, fieldInfo *FieldInfo, _state *BlockTermState, 489 absolute bool) (err error) { 490 491 assert(longs != nil) 492 assert(len(longs) > 0) 493 state := _state.Self.(*intBlockTermState) 494 if absolute { 495 w.lastState = emptyState 496 } 497 longs[0] = state.docStartFP - w.lastState.docStartFP 498 if w.fieldHasPositions { 499 longs[1] = state.posStartFP - w.lastState.posStartFP 500 if w.fieldHasPayloads || w.fieldHasOffsets { 501 longs[2] = state.payStartFP - w.lastState.payStartFP 502 } 503 } 504 if state.singletonDocID != -1 { 505 if err = out.WriteVInt(int32(state.singletonDocID)); err != nil { 506 return 507 } 508 } 509 if w.fieldHasPositions && state.lastPosBlockOffset != -1 { 510 if err = out.WriteVLong(state.lastPosBlockOffset); err != nil { 511 return 512 } 513 } 514 if state.skipOffset != -1 { 515 if err = out.WriteVLong(state.skipOffset); err != nil { 516 return 517 } 518 } 519 w.lastState = state 520 return nil 521 } 522 523 func (w *Lucene41PostingsWriter) Close() (err error) { 524 var success = false 525 defer func() { 526 if success { 527 err = util.Close(w.docOut, w.posOut, w.payOut) 528 } else { 529 util.CloseWhileSuppressingError(w.docOut, w.posOut, w.payOut) 530 } 531 w.docOut = nil 532 w.posOut = nil 533 w.payOut = nil 534 }() 535 536 if err == nil && w.docOut != nil { 537 err = codec.WriteFooter(w.docOut) 538 } 539 if err == nil && w.posOut != nil { 540 err = codec.WriteFooter(w.posOut) 541 } 542 if err == nil && w.payOut != nil { 543 err = codec.WriteFooter(w.payOut) 544 } 545 if err != nil { 546 return 547 } 548 success = true 549 return nil 550 }