// github.com/balzaczyy/golucene/core/codec/compressing/storedFieldsWriter.go

package compressing

import (
	"errors"
	"fmt"
	"github.com/balzaczyy/golucene/core/codec"
	"github.com/balzaczyy/golucene/core/codec/lucene40"
	"github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
	"github.com/balzaczyy/golucene/core/util/packed"
	"math"
)

/* hard limit on the maximum number of documents per chunk */
const MAX_DOCUMENTS_PER_CHUNK = 128

const (
	STRING         = 0x00
	BYTE_ARR       = 0x01
	NUMERIC_INT    = 0x02
	NUMERIC_FLOAT  = 0x03
	NUMERIC_LONG   = 0x04
	NUMERIC_DOUBLE = 0x05
)

var (
	TYPE_BITS = packed.BitsRequired(NUMERIC_DOUBLE)
	TYPE_MASK = int(packed.MaxValue(TYPE_BITS))
)
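// Each stored field value is preceded by a single VLong packing the field
// number together with one of the type markers above; WriteField (later in
// this file) computes (fieldNumber << TYPE_BITS) | type. A minimal sketch of
// that packing and its inverse (the decoding side is an assumption here,
// mirroring how TYPE_MASK is defined; the actual reader lives elsewhere):
//
//	infoAndBits := (int64(fieldNumber) << uint(TYPE_BITS)) | int64(NUMERIC_LONG)
//	valueType := int(infoAndBits) & TYPE_MASK       // NUMERIC_LONG
//	fieldNum := int(infoAndBits >> uint(TYPE_BITS)) // fieldNumber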
const (
	CODEC_SFX_IDX      = "Index"
	CODEC_SFX_DAT      = "Data"
	VERSION_START      = 0
	VERSION_BIG_CHUNKS = 1
	VERSION_CHECKSUM   = 2
	VERSION_CURRENT    = VERSION_CHECKSUM
)

/* StoredFieldsWriter impl for CompressingStoredFieldsFormat */
type CompressingStoredFieldsWriter struct {
	directory     store.Directory
	segment       string
	segmentSuffix string
	indexWriter   *StoredFieldsIndexWriter
	fieldsStream  store.IndexOutput

	compressionMode CompressionMode
	compressor      Compressor
	chunkSize       int

	bufferedDocs    *GrowableByteArrayDataOutput
	numStoredFields []int // number of stored fields per buffered doc
	endOffsets      []int // end offsets in bufferedDocs
	docBase         int   // doc ID at the beginning of the chunk
	numBufferedDocs int   // docBase + numBufferedDocs == current doc ID

	numStoredFieldsInDoc int
}

func NewCompressingStoredFieldsWriter(dir store.Directory, si *model.SegmentInfo,
	segmentSuffix string, ctx store.IOContext, formatName string,
	compressionMode CompressionMode, chunkSize int) (*CompressingStoredFieldsWriter, error) {

	assert(dir != nil)
	ans := &CompressingStoredFieldsWriter{
		directory:       dir,
		segment:         si.Name,
		segmentSuffix:   segmentSuffix,
		compressionMode: compressionMode,
		compressor:      compressionMode.NewCompressor(),
		chunkSize:       chunkSize,
		docBase:         0,
		bufferedDocs:    newGrowableByteArrayDataOutput(chunkSize),
		numStoredFields: make([]int, 16),
		endOffsets:      make([]int, 16),
		numBufferedDocs: 0,
	}

	var success = false
	indexStream, err := dir.CreateOutput(util.SegmentFileName(si.Name, segmentSuffix,
		lucene40.FIELDS_INDEX_EXTENSION), ctx)
	if err != nil {
		return nil, err
	}
	assert(indexStream != nil)
	defer func() {
		if !success {
			util.CloseWhileSuppressingError(indexStream)
			ans.Abort()
		}
	}()

	ans.fieldsStream, err = dir.CreateOutput(util.SegmentFileName(si.Name, segmentSuffix,
		lucene40.FIELDS_EXTENSION), ctx)
	if err != nil {
		return nil, err
	}

	codecNameIdx := formatName + CODEC_SFX_IDX
	codecNameDat := formatName + CODEC_SFX_DAT
	err = codec.WriteHeader(indexStream, codecNameIdx, VERSION_CURRENT)
	if err != nil {
		return nil, err
	}
	err = codec.WriteHeader(ans.fieldsStream, codecNameDat, VERSION_CURRENT)
	if err != nil {
		return nil, err
	}
	assert(int64(codec.HeaderLength(codecNameIdx)) == indexStream.FilePointer())
	assert(int64(codec.HeaderLength(codecNameDat)) == ans.fieldsStream.FilePointer())

	ans.indexWriter, err = NewStoredFieldsIndexWriter(indexStream)
	if err != nil {
		return nil, err
	}
	assert(ans.indexWriter != nil)
	indexStream = nil

	err = ans.fieldsStream.WriteVInt(int32(chunkSize))
	if err != nil {
		return nil, err
	}
	err = ans.fieldsStream.WriteVInt(packed.VERSION_CURRENT)
	if err != nil {
		return nil, err
	}

	success = true
	return ans, nil
}

func assert(ok bool) {
	assert2(ok, "assert fail")
}

func assert2(ok bool, msg string, args ...interface{}) {
	if !ok {
		panic(fmt.Sprintf(msg, args...))
	}
}

func (w *CompressingStoredFieldsWriter) Close() error {
	assert(w != nil)
	defer func() {
		if w != nil {
			w.fieldsStream = nil
			w.indexWriter = nil
		}
	}()
	return util.Close(w.fieldsStream, w.indexWriter)
}

func (w *CompressingStoredFieldsWriter) StartDocument() error { return nil }

func (w *CompressingStoredFieldsWriter) FinishDocument() error {
	if w.numBufferedDocs == len(w.numStoredFields) {
		newLength := util.Oversize(w.numBufferedDocs+1, 4)

		oldArray := w.endOffsets
		w.endOffsets = make([]int, newLength)
		copy(w.endOffsets, oldArray)

		oldArray = w.numStoredFields
		w.numStoredFields = make([]int, newLength)
		copy(w.numStoredFields, oldArray)
	}
	w.numStoredFields[w.numBufferedDocs] = w.numStoredFieldsInDoc
	w.numStoredFieldsInDoc = 0
	w.endOffsets[w.numBufferedDocs] = w.bufferedDocs.length
	w.numBufferedDocs++
	if w.triggerFlush() {
		return w.flush()
	}
	return nil
}

func saveInts(values []int, out DataOutput) error {
	length := len(values)
	assert(length > 0)
	if length == 1 {
		return out.WriteVInt(int32(values[0]))
	}

	var allEqual = true
	var sentinel = values[0]
	for _, v := range values[1:] {
		if v != sentinel {
			allEqual = false
			break
		}
	}
	if allEqual {
		err := out.WriteVInt(0)
		if err == nil {
			err = out.WriteVInt(int32(values[0]))
		}
		return err
	}

	var max int64 = 0
	for _, v := range values {
		max |= int64(v)
	}
	var bitsRequired = packed.BitsRequired(max)
	err := out.WriteVInt(int32(bitsRequired))
	if err != nil {
		return err
	}

	w := packed.WriterNoHeader(out, packed.PackedFormat(packed.PACKED), length, bitsRequired, 1)
	for _, v := range values {
		if err = w.Add(int64(v)); err != nil {
			return err
		}
	}
	return w.Finish()
}
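// saveInts picks between three encodings so the common cases (a single
// document per chunk, or uniform per-document counts) stay cheap. A sketch
// of the resulting write sequences, derived from the branches above:
//
//	saveInts([]int{7}, out)       // VInt(7)
//	saveInts([]int{5, 5, 5}, out) // VInt(0), VInt(5)
//	saveInts([]int{3, 9, 4}, out) // VInt(4), then 3, 9, 4 packed at 4 bits each
//
// In the last case the bit width is packed.BitsRequired(3|9|4) = 4: OR-ing
// the values preserves the highest set bit of the maximum, so it yields the
// same width as the maximum would, and it is safe because the lengths and
// counts being encoded are non-negative.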
func (w *CompressingStoredFieldsWriter) writeHeader(docBase,
	numBufferedDocs int, numStoredFields, lengths []int) error {

	// save docBase and numBufferedDocs
	err := w.fieldsStream.WriteVInt(int32(docBase)) // TODO precision loss risk
	if err == nil {
		err = w.fieldsStream.WriteVInt(int32(numBufferedDocs)) // TODO precision loss risk
		if err == nil {
			// save numStoredFields
			err = saveInts(numStoredFields[:numBufferedDocs], w.fieldsStream)
			if err == nil {
				// save lengths
				err = saveInts(lengths[:numBufferedDocs], w.fieldsStream)
			}
		}
	}
	return err
}

func (w *CompressingStoredFieldsWriter) triggerFlush() bool {
	return w.bufferedDocs.length >= w.chunkSize || // chunks of at least chunkSize bytes
		w.numBufferedDocs >= MAX_DOCUMENTS_PER_CHUNK
}

func (w *CompressingStoredFieldsWriter) flush() error {
	err := w.indexWriter.writeIndex(w.numBufferedDocs, w.fieldsStream.FilePointer())
	if err != nil {
		return err
	}

	// transform end offsets into lengths
	lengths := w.endOffsets
	for i := w.numBufferedDocs - 1; i > 0; i-- {
		lengths[i] = w.endOffsets[i] - w.endOffsets[i-1]
		assert(lengths[i] >= 0)
	}
	err = w.writeHeader(w.docBase, w.numBufferedDocs, w.numStoredFields, lengths)
	if err != nil {
		return err
	}

	// compress stored fields to fieldsStream
	if w.bufferedDocs.length >= 2*w.chunkSize {
		// big chunk, slice it
		for compressed := 0; compressed < w.bufferedDocs.length; compressed += w.chunkSize {
			size := w.bufferedDocs.length - compressed
			if w.chunkSize < size {
				size = w.chunkSize
			}
			err = w.compressor(w.bufferedDocs.bytes[compressed:compressed+size], w.fieldsStream)
			if err != nil {
				return err
			}
		}
	} else {
		err = w.compressor(w.bufferedDocs.bytes[:w.bufferedDocs.length], w.fieldsStream)
		if err != nil {
			return err
		}
	}

	// reset
	w.docBase += w.numBufferedDocs
	w.numBufferedDocs = 0
	w.bufferedDocs.length = 0
	return nil
}
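// Taken together, writeHeader and flush give each chunk this on-disk shape:
//
//	VInt docBase, VInt numBufferedDocs,
//	saveInts(numStoredFields per doc), saveInts(byte length per doc),
//	compressed document bytes
//
// Since VERSION_BIG_CHUNKS, a buffer of at least 2*chunkSize bytes is
// compressed as independent slices of at most chunkSize bytes each;
// presumably this lets a reader decompress a large chunk piecewise rather
// than buffering the whole chunk at once.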
func (w *CompressingStoredFieldsWriter) WriteField(info *model.FieldInfo, field model.IndexableField) error {
	w.numStoredFieldsInDoc++

	bits := 0
	var bytes []byte
	var str string

	number := field.NumericValue()
	if number != nil {
		switch t := number.(type) {
		case int32:
			bits = NUMERIC_INT
		case int64:
			bits = NUMERIC_LONG
		case float32:
			bits = NUMERIC_FLOAT
		case float64:
			bits = NUMERIC_DOUBLE
		default:
			panic(fmt.Sprintf("cannot store numeric value %v of type %v", number, t))
		}
	} else {
		bytes = field.BinaryValue()
		if bytes != nil {
			bits = BYTE_ARR
		} else {
			bits = STRING
			str = field.StringValue()
			assert2(str != "",
				"field %v is stored but does not have binaryValue, stringValue nor numericValue",
				field.Name())
		}
	}

	infoAndBits := (int64(info.Number) << uint(TYPE_BITS)) | int64(bits)
	err := w.bufferedDocs.WriteVLong(infoAndBits)
	if err != nil {
		return err
	}

	switch {
	case bytes != nil:
		err = w.bufferedDocs.WriteVInt(int32(len(bytes)))
		if err == nil {
			err = w.bufferedDocs.WriteBytes(bytes)
		}
	case str != "":
		err = w.bufferedDocs.WriteString(str)
	case bits == NUMERIC_INT:
		err = w.bufferedDocs.WriteInt(number.(int32))
	case bits == NUMERIC_LONG:
		err = w.bufferedDocs.WriteLong(number.(int64))
	case bits == NUMERIC_FLOAT:
		err = w.bufferedDocs.WriteInt(int32(math.Float32bits(number.(float32))))
	case bits == NUMERIC_DOUBLE:
		err = w.bufferedDocs.WriteLong(int64(math.Float64bits(number.(float64))))
	default:
		panic("Cannot get here")
	}
	return err
}

func (w *CompressingStoredFieldsWriter) Abort() {
	if w == nil { // tolerate a nil receiver from an early release
		return
	}
	util.CloseWhileSuppressingError(w)
	util.DeleteFilesIgnoringErrors(w.directory,
		util.SegmentFileName(w.segment, w.segmentSuffix, lucene40.FIELDS_EXTENSION),
		util.SegmentFileName(w.segment, w.segmentSuffix, lucene40.FIELDS_INDEX_EXTENSION))
}

func (w *CompressingStoredFieldsWriter) Finish(fis model.FieldInfos, numDocs int) (err error) {
	if w == nil {
		return errors.New("nil CompressingStoredFieldsWriter")
	}
	assert2(w.indexWriter != nil, "already closed?")
	if w.numBufferedDocs > 0 {
		if err = w.flush(); err != nil {
			return err
		}
	} else {
		assert(w.bufferedDocs.length == 0)
	}
	assert2(w.docBase == numDocs,
		"Wrote %v docs, finish called with numDocs=%v", w.docBase, numDocs)
	if err = w.indexWriter.finish(numDocs, w.fieldsStream.FilePointer()); err != nil {
		return err
	}
	if err = codec.WriteFooter(w.fieldsStream); err != nil {
		return err
	}
	assert(w.bufferedDocs.length == 0)
	return nil
}

// util/GrowableByteArrayDataOutput.java

/* A DataOutput that can be used to build a []byte */
type GrowableByteArrayDataOutput struct {
	*util.DataOutputImpl
	bytes  []byte
	length int
}

func newGrowableByteArrayDataOutput(cp int) *GrowableByteArrayDataOutput {
	ans := &GrowableByteArrayDataOutput{bytes: make([]byte, 0, util.Oversize(cp, 1))}
	ans.DataOutputImpl = util.NewDataOutput(ans)
	return ans
}

func (out *GrowableByteArrayDataOutput) WriteByte(b byte) error {
	assert(out.length <= len(out.bytes))
	if out.length < len(out.bytes) {
		out.bytes[out.length] = b
	} else {
		out.bytes = append(out.bytes, b)
	}
	out.length++
	return nil
}

func (out *GrowableByteArrayDataOutput) WriteBytes(b []byte) error {
	assert(out.length <= len(out.bytes))
	remaining := len(out.bytes) - out.length
	if remaining > len(b) {
		copy(out.bytes[out.length:], b)
	} else if remaining == 0 {
		out.bytes = append(out.bytes, b...)
	} else {
		copy(out.bytes[out.length:], b[:remaining])
		out.bytes = append(out.bytes, b[remaining:]...)
	}
	out.length += len(b)
	return nil
}
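// A minimal sketch of the writer's lifecycle, assuming dir, si, ctx,
// compressionMode and the per-field metadata (docs, fieldInfo, fieldInfos)
// are set up elsewhere; the format name and chunk size shown are the
// Lucene41 defaults only by assumption:
//
//	w, err := NewCompressingStoredFieldsWriter(dir, si, "", ctx,
//		"Lucene41StoredFields", compressionMode, 1<<14)
//	if err != nil { ... }
//	for _, doc := range docs {
//		w.StartDocument()
//		for _, f := range doc.Fields() { // one call per stored field
//			w.WriteField(fieldInfo(f), f)
//		}
//		w.FinishDocument() // may trigger a chunk flush
//	}
//	err = w.Finish(fieldInfos, len(docs)) // flush the tail chunk, write the footer
//	err = w.Close()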