github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/compressing/storedFieldsIndexWriter.go (about)

     1  package compressing
     2  
     3  import (
     4  	"github.com/balzaczyy/golucene/core/codec"
     5  	"github.com/balzaczyy/golucene/core/store"
     6  	"github.com/balzaczyy/golucene/core/util"
     7  	"github.com/balzaczyy/golucene/core/util/packed"
     8  	"math"
     9  )
    10  
/* number of chunks to serialize at once; a block is flushed once this many chunks have been buffered */
const BLOCK_SIZE = 1024
    13  
    14  /*
    15  Efficient index format for block-based Codecs.
    16  
This writer generates a file which can be loaded into memory using
    18  memory-efficient data structures to quickly locate the block that
    19  contains any document.
    20  
    21  In order to have a compact in-memory representation, for every block
    22  of 1024 chunks, this index computes the average number of bytes per
    23  chunk and for every chunk, only stores the difference between
    24  
    25  - ${chunk number} * ${average length of a chunk}
    26  - and the actual start offset of the chunk
    27  
    28  Data is written as follows:
    29  
    30  	- PackedIntsVersion, <Block>^BlockCount, BlocksEndMarker
    31  	- PackedIntsVersion --> VERSION_CURRENT as a vint
    32  	- BlocksEndMarker --> 0 as a vint, this marks the end of blocks since blocks are not allowed to start with 0
    33  	- Block --> BlockChunks, <Docbases>, <StartPointers>
    34  	- BlockChunks --> a vint which is the number of chunks encoded in the block
    35  	- DocBases --> DocBase, AvgChunkDocs, BitsPerDocbaseDelta, DocBaseDeltas
    36  	- DocBase --> first document ID of the block of chunks, as a vint
    37  	- AvgChunkDocs --> average number of documents in a single chunk, as a vint
    38  	- BitsPerDocBaseDelta --> number of bits required to represent a delta from the average using ZigZag encoding
    39  	- DocBaseDeltas --> packed array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using ZigZag encoding.
	- StartPointers --> StartPointerBase, AvgChunkSize, BitsPerStartPointerDelta, StartPointerDeltas
	- StartPointerBase --> the first start pointer of the block, as a vint64
    42  	- AvgChunkSize --> the average size of a chunk of compressed documents, as a vint64
    43  	- BitsPerStartPointerDelta --> number of bits required to represent a delta from the average using ZigZag encoding
    44  	- StartPointerDeltas --> packed array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using ZigZag encoding
    45  
    46  Notes
    47  
    48  - For any block, the doc base of the n-th chunk can be restored with
DocBase + AvgChunkDocs * n + DocBaseDeltas[n].
    50  - For any block, the start pointer of the n-th chunk can be restored
    51  with StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n].
    52  - Once data is loaded into memory, you can lookup the start pointer
    53  of any document by performing two binary searches: a first one based
    54  on the values of DocBase in order to find the right block, and then
    55  inside the block based on DocBaseDeltas (by reconstructing the doc
    56  bases for every chunk).
    57  */
type StoredFieldsIndexWriter struct {
	fieldsIndexOut     store.IndexOutput // destination index output (the version header is written by the constructor)
	totalDocs          int               // total docs registered since the writer was created
	blockDocs          int               // docs buffered in the current block
	blockChunks        int               // chunks buffered in the current block, at most BLOCK_SIZE
	firstStartPointer  int64             // start pointer of the current block's first chunk; -1 means the block is empty
	maxStartPointer    int64             // start pointer of the most recently registered chunk
	docBaseDeltas      []int             // per buffered chunk: number of docs in that chunk
	startPointerDeltas []int64           // per buffered chunk: byte delta from the previous chunk's start pointer
}
    68  
    69  func NewStoredFieldsIndexWriter(indexOutput store.IndexOutput) (*StoredFieldsIndexWriter, error) {
    70  	err := indexOutput.WriteVInt(packed.VERSION_CURRENT)
    71  	if err != nil {
    72  		return nil, err
    73  	}
    74  	return &StoredFieldsIndexWriter{
    75  		fieldsIndexOut:     indexOutput,
    76  		blockChunks:        0,
    77  		blockDocs:          0,
    78  		firstStartPointer:  -1,
    79  		totalDocs:          0,
    80  		docBaseDeltas:      make([]int, BLOCK_SIZE),
    81  		startPointerDeltas: make([]int64, BLOCK_SIZE),
    82  	}, nil
    83  }
    84  
    85  func (w *StoredFieldsIndexWriter) reset() {
    86  	w.blockChunks = 0
    87  	w.blockDocs = 0
    88  	w.firstStartPointer = -1 // means unset
    89  }
    90  
    91  func (w *StoredFieldsIndexWriter) writeBlock() error {
    92  	assert(w.blockChunks > 0)
    93  	err := w.fieldsIndexOut.WriteVInt(int32(w.blockChunks))
    94  	if err != nil {
    95  		return err
    96  	}
    97  
    98  	// The trick here is that we only store the difference from the
    99  	// average start pointer or doc base, this helps save bits per
   100  	// value. And in order to prevent a few chunks that would be far
   101  	// from the average to raise the number of bits per value for all
   102  	// of them, we only encode blocks of 1024 chunks at once.
   103  	// See LUCENE-4512
   104  
   105  	// doc bases
   106  	var avgChunkDocs int
   107  	if w.blockChunks == 1 {
   108  		avgChunkDocs = 0
   109  	} else {
   110  		avgChunkDocs = int(math.Floor(float64(w.blockDocs-w.docBaseDeltas[w.blockChunks-1])/float64(w.blockChunks-1) + 0.5))
   111  	}
   112  	err = w.fieldsIndexOut.WriteVInt(int32(w.totalDocs - w.blockDocs)) // doc base
   113  	if err == nil {
   114  		err = w.fieldsIndexOut.WriteVInt(int32(avgChunkDocs))
   115  	}
   116  	if err != nil {
   117  		return err
   118  	}
   119  	var docBase int = 0
   120  	var maxDelta int64 = 0
   121  	for i := 0; i < w.blockChunks; i++ {
   122  		delta := docBase - avgChunkDocs*i
   123  		maxDelta |= util.ZigZagEncodeLong(int64(delta))
   124  		docBase += w.docBaseDeltas[i]
   125  	}
   126  
   127  	bitsPerDocbase := packed.BitsRequired(maxDelta)
   128  	err = w.fieldsIndexOut.WriteVInt(int32(bitsPerDocbase))
   129  	if err != nil {
   130  		return err
   131  	}
   132  	writer := packed.WriterNoHeader(w.fieldsIndexOut,
   133  		packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerDocbase, 1)
   134  	docBase = 0
   135  	for i := 0; i < w.blockChunks; i++ {
   136  		delta := docBase - avgChunkDocs*i
   137  		assert(packed.BitsRequired(util.ZigZagEncodeLong(int64(delta))) <= writer.BitsPerValue())
   138  		err = writer.Add(util.ZigZagEncodeLong(int64(delta)))
   139  		if err != nil {
   140  			return err
   141  		}
   142  		docBase += w.docBaseDeltas[i]
   143  	}
   144  	err = writer.Finish()
   145  	if err != nil {
   146  		return err
   147  	}
   148  
   149  	// start pointers
   150  	w.fieldsIndexOut.WriteVLong(w.firstStartPointer)
   151  	var avgChunkSize int64
   152  	if w.blockChunks == 1 {
   153  		avgChunkSize = 0
   154  	} else {
   155  		avgChunkSize = (w.maxStartPointer - w.firstStartPointer) / int64(w.blockChunks-1)
   156  	}
   157  	err = w.fieldsIndexOut.WriteVLong(avgChunkSize)
   158  	if err != nil {
   159  		return err
   160  	}
   161  	var startPointer int64 = 0
   162  	maxDelta = 0
   163  	for i := 0; i < w.blockChunks; i++ {
   164  		startPointer += w.startPointerDeltas[i]
   165  		delta := startPointer - avgChunkSize*int64(i)
   166  		maxDelta |= util.ZigZagEncodeLong(delta)
   167  	}
   168  
   169  	bitsPerStartPointer := packed.BitsRequired(maxDelta)
   170  	err = w.fieldsIndexOut.WriteVInt(int32(bitsPerStartPointer))
   171  	if err != nil {
   172  		return err
   173  	}
   174  	writer = packed.WriterNoHeader(w.fieldsIndexOut,
   175  		packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerStartPointer, 1)
   176  	startPointer = 0
   177  	for i := 0; i < w.blockChunks; i++ {
   178  		startPointer += w.startPointerDeltas[i]
   179  		delta := startPointer - avgChunkSize*int64(i)
   180  		assert(packed.BitsRequired(util.ZigZagEncodeLong(delta)) <= writer.BitsPerValue())
   181  		err = writer.Add(util.ZigZagEncodeLong(delta))
   182  		if err != nil {
   183  			return err
   184  		}
   185  	}
   186  	return writer.Finish()
   187  }
   188  
   189  func (w *StoredFieldsIndexWriter) writeIndex(numDocs int, startPointer int64) error {
   190  	if w.blockChunks == BLOCK_SIZE {
   191  		err := w.writeBlock()
   192  		if err != nil {
   193  			return err
   194  		}
   195  		w.reset()
   196  	}
   197  
   198  	if w.firstStartPointer == -1 {
   199  		w.firstStartPointer, w.maxStartPointer = startPointer, startPointer
   200  	}
   201  	assert(w.firstStartPointer > 0 && startPointer >= w.firstStartPointer)
   202  
   203  	w.docBaseDeltas[w.blockChunks] = numDocs
   204  	w.startPointerDeltas[w.blockChunks] = startPointer - w.maxStartPointer
   205  
   206  	w.blockChunks++
   207  	w.blockDocs += numDocs
   208  	w.totalDocs += numDocs
   209  	w.maxStartPointer = startPointer
   210  	return nil
   211  }
   212  
   213  func (w *StoredFieldsIndexWriter) finish(numDocs int, maxPointer int64) (err error) {
   214  	assert(w != nil)
   215  	assert2(numDocs == w.totalDocs, "Expected %v docs, but got %v", numDocs, w.totalDocs)
   216  	if w.blockChunks > 0 {
   217  		if err = w.writeBlock(); err != nil {
   218  			return
   219  		}
   220  	}
   221  	if err = w.fieldsIndexOut.WriteVInt(0); err != nil { // end marker
   222  		return
   223  	}
   224  	if err = w.fieldsIndexOut.WriteVLong(maxPointer); err != nil {
   225  		return
   226  	}
   227  	return codec.WriteFooter(w.fieldsIndexOut)
   228  }
   229  
   230  func (w *StoredFieldsIndexWriter) Close() error {
   231  	if w == nil {
   232  		return nil
   233  	}
   234  	assert(w.fieldsIndexOut != nil)
   235  	return w.fieldsIndexOut.Close()
   236  }