github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/compressing/storedFieldsIndexWriter.go (about) 1 package compressing 2 3 import ( 4 "github.com/balzaczyy/golucene/core/codec" 5 "github.com/balzaczyy/golucene/core/store" 6 "github.com/balzaczyy/golucene/core/util" 7 "github.com/balzaczyy/golucene/core/util/packed" 8 "math" 9 ) 10 11 /* number of chunks to serialize at once */ 12 const BLOCK_SIZE = 1024 13 14 /* 15 Efficient index format for block-based Codecs. 16 17 This writer generates a file which be loaded into memory using 18 memory-efficient data structures to quickly locate the block that 19 contains any document. 20 21 In order to have a compact in-memory representation, for every block 22 of 1024 chunks, this index computes the average number of bytes per 23 chunk and for every chunk, only stores the difference between 24 25 - ${chunk number} * ${average length of a chunk} 26 - and the actual start offset of the chunk 27 28 Data is written as follows: 29 30 - PackedIntsVersion, <Block>^BlockCount, BlocksEndMarker 31 - PackedIntsVersion --> VERSION_CURRENT as a vint 32 - BlocksEndMarker --> 0 as a vint, this marks the end of blocks since blocks are not allowed to start with 0 33 - Block --> BlockChunks, <Docbases>, <StartPointers> 34 - BlockChunks --> a vint which is the number of chunks encoded in the block 35 - DocBases --> DocBase, AvgChunkDocs, BitsPerDocbaseDelta, DocBaseDeltas 36 - DocBase --> first document ID of the block of chunks, as a vint 37 - AvgChunkDocs --> average number of documents in a single chunk, as a vint 38 - BitsPerDocBaseDelta --> number of bits required to represent a delta from the average using ZigZag encoding 39 - DocBaseDeltas --> packed array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using ZigZag encoding. 40 - StartPointers --> StartointerBase, AveChunkSize, BitsPerStartPointerDelta, StartPointerDeltas 41 - StartPointerBase --> the first start ointer of the block, as a vint64 42 - AvgChunkSize --> the average size of a chunk of compressed documents, as a vint64 43 - BitsPerStartPointerDelta --> number of bits required to represent a delta from the average using ZigZag encoding 44 - StartPointerDeltas --> packed array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using ZigZag encoding 45 46 Notes 47 48 - For any block, the doc base of the n-th chunk can be restored with 49 DocBase + AvgChunkDocs * n + DOcBsaeDeltas[n]. 50 - For any block, the start pointer of the n-th chunk can be restored 51 with StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n]. 52 - Once data is loaded into memory, you can lookup the start pointer 53 of any document by performing two binary searches: a first one based 54 on the values of DocBase in order to find the right block, and then 55 inside the block based on DocBaseDeltas (by reconstructing the doc 56 bases for every chunk). 57 */ 58 type StoredFieldsIndexWriter struct { 59 fieldsIndexOut store.IndexOutput 60 totalDocs int 61 blockDocs int 62 blockChunks int 63 firstStartPointer int64 64 maxStartPointer int64 65 docBaseDeltas []int 66 startPointerDeltas []int64 67 } 68 69 func NewStoredFieldsIndexWriter(indexOutput store.IndexOutput) (*StoredFieldsIndexWriter, error) { 70 err := indexOutput.WriteVInt(packed.VERSION_CURRENT) 71 if err != nil { 72 return nil, err 73 } 74 return &StoredFieldsIndexWriter{ 75 fieldsIndexOut: indexOutput, 76 blockChunks: 0, 77 blockDocs: 0, 78 firstStartPointer: -1, 79 totalDocs: 0, 80 docBaseDeltas: make([]int, BLOCK_SIZE), 81 startPointerDeltas: make([]int64, BLOCK_SIZE), 82 }, nil 83 } 84 85 func (w *StoredFieldsIndexWriter) reset() { 86 w.blockChunks = 0 87 w.blockDocs = 0 88 w.firstStartPointer = -1 // means unset 89 } 90 91 func (w *StoredFieldsIndexWriter) writeBlock() error { 92 assert(w.blockChunks > 0) 93 err := w.fieldsIndexOut.WriteVInt(int32(w.blockChunks)) 94 if err != nil { 95 return err 96 } 97 98 // The trick here is that we only store the difference from the 99 // average start pointer or doc base, this helps save bits per 100 // value. And in order to prevent a few chunks that would be far 101 // from the average to raise the number of bits per value for all 102 // of them, we only encode blocks of 1024 chunks at once. 103 // See LUCENE-4512 104 105 // doc bases 106 var avgChunkDocs int 107 if w.blockChunks == 1 { 108 avgChunkDocs = 0 109 } else { 110 avgChunkDocs = int(math.Floor(float64(w.blockDocs-w.docBaseDeltas[w.blockChunks-1])/float64(w.blockChunks-1) + 0.5)) 111 } 112 err = w.fieldsIndexOut.WriteVInt(int32(w.totalDocs - w.blockDocs)) // doc base 113 if err == nil { 114 err = w.fieldsIndexOut.WriteVInt(int32(avgChunkDocs)) 115 } 116 if err != nil { 117 return err 118 } 119 var docBase int = 0 120 var maxDelta int64 = 0 121 for i := 0; i < w.blockChunks; i++ { 122 delta := docBase - avgChunkDocs*i 123 maxDelta |= util.ZigZagEncodeLong(int64(delta)) 124 docBase += w.docBaseDeltas[i] 125 } 126 127 bitsPerDocbase := packed.BitsRequired(maxDelta) 128 err = w.fieldsIndexOut.WriteVInt(int32(bitsPerDocbase)) 129 if err != nil { 130 return err 131 } 132 writer := packed.WriterNoHeader(w.fieldsIndexOut, 133 packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerDocbase, 1) 134 docBase = 0 135 for i := 0; i < w.blockChunks; i++ { 136 delta := docBase - avgChunkDocs*i 137 assert(packed.BitsRequired(util.ZigZagEncodeLong(int64(delta))) <= writer.BitsPerValue()) 138 err = writer.Add(util.ZigZagEncodeLong(int64(delta))) 139 if err != nil { 140 return err 141 } 142 docBase += w.docBaseDeltas[i] 143 } 144 err = writer.Finish() 145 if err != nil { 146 return err 147 } 148 149 // start pointers 150 w.fieldsIndexOut.WriteVLong(w.firstStartPointer) 151 var avgChunkSize int64 152 if w.blockChunks == 1 { 153 avgChunkSize = 0 154 } else { 155 avgChunkSize = (w.maxStartPointer - w.firstStartPointer) / int64(w.blockChunks-1) 156 } 157 err = w.fieldsIndexOut.WriteVLong(avgChunkSize) 158 if err != nil { 159 return err 160 } 161 var startPointer int64 = 0 162 maxDelta = 0 163 for i := 0; i < w.blockChunks; i++ { 164 startPointer += w.startPointerDeltas[i] 165 delta := startPointer - avgChunkSize*int64(i) 166 maxDelta |= util.ZigZagEncodeLong(delta) 167 } 168 169 bitsPerStartPointer := packed.BitsRequired(maxDelta) 170 err = w.fieldsIndexOut.WriteVInt(int32(bitsPerStartPointer)) 171 if err != nil { 172 return err 173 } 174 writer = packed.WriterNoHeader(w.fieldsIndexOut, 175 packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerStartPointer, 1) 176 startPointer = 0 177 for i := 0; i < w.blockChunks; i++ { 178 startPointer += w.startPointerDeltas[i] 179 delta := startPointer - avgChunkSize*int64(i) 180 assert(packed.BitsRequired(util.ZigZagEncodeLong(delta)) <= writer.BitsPerValue()) 181 err = writer.Add(util.ZigZagEncodeLong(delta)) 182 if err != nil { 183 return err 184 } 185 } 186 return writer.Finish() 187 } 188 189 func (w *StoredFieldsIndexWriter) writeIndex(numDocs int, startPointer int64) error { 190 if w.blockChunks == BLOCK_SIZE { 191 err := w.writeBlock() 192 if err != nil { 193 return err 194 } 195 w.reset() 196 } 197 198 if w.firstStartPointer == -1 { 199 w.firstStartPointer, w.maxStartPointer = startPointer, startPointer 200 } 201 assert(w.firstStartPointer > 0 && startPointer >= w.firstStartPointer) 202 203 w.docBaseDeltas[w.blockChunks] = numDocs 204 w.startPointerDeltas[w.blockChunks] = startPointer - w.maxStartPointer 205 206 w.blockChunks++ 207 w.blockDocs += numDocs 208 w.totalDocs += numDocs 209 w.maxStartPointer = startPointer 210 return nil 211 } 212 213 func (w *StoredFieldsIndexWriter) finish(numDocs int, maxPointer int64) (err error) { 214 assert(w != nil) 215 assert2(numDocs == w.totalDocs, "Expected %v docs, but got %v", numDocs, w.totalDocs) 216 if w.blockChunks > 0 { 217 if err = w.writeBlock(); err != nil { 218 return 219 } 220 } 221 if err = w.fieldsIndexOut.WriteVInt(0); err != nil { // end marker 222 return 223 } 224 if err = w.fieldsIndexOut.WriteVLong(maxPointer); err != nil { 225 return 226 } 227 return codec.WriteFooter(w.fieldsIndexOut) 228 } 229 230 func (w *StoredFieldsIndexWriter) Close() error { 231 if w == nil { 232 return nil 233 } 234 assert(w.fieldsIndexOut != nil) 235 return w.fieldsIndexOut.Close() 236 }