github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/compressing/storedFieldsReader.go (about) 1 package compressing 2 3 import ( 4 "errors" 5 "fmt" 6 "github.com/balzaczyy/golucene/core/codec" 7 "github.com/balzaczyy/golucene/core/codec/lucene40" 8 . "github.com/balzaczyy/golucene/core/codec/spi" 9 "github.com/balzaczyy/golucene/core/index/model" 10 "github.com/balzaczyy/golucene/core/store" 11 "github.com/balzaczyy/golucene/core/util" 12 "github.com/balzaczyy/golucene/core/util/packed" 13 ) 14 15 // codec/compressing/CompressingStoredFieldsReader.java 16 17 // Do not reuse the decompression buffer when there is more than 32kb to decompress 18 const BUFFER_REUSE_THRESHOLD = 1 << 15 19 20 // StoredFieldsReader impl for CompressingStoredFieldsFormat 21 type CompressingStoredFieldsReader struct { 22 version int 23 fieldInfos model.FieldInfos 24 indexReader *CompressingStoredFieldsIndexReader 25 maxPointer int64 26 fieldsStream store.IndexInput 27 chunkSize int 28 packedIntsVersion int 29 compressionMode CompressionMode 30 decompressor Decompressor 31 bytes []byte 32 numDocs int 33 closed bool 34 } 35 36 // used by clone 37 func newCompressingStoredFieldsReaderFrom(reader *CompressingStoredFieldsReader) *CompressingStoredFieldsReader { 38 return &CompressingStoredFieldsReader{ 39 version: reader.version, 40 fieldInfos: reader.fieldInfos, 41 fieldsStream: reader.fieldsStream.Clone(), 42 indexReader: reader.indexReader.Clone(), 43 maxPointer: reader.maxPointer, 44 chunkSize: reader.chunkSize, 45 packedIntsVersion: reader.packedIntsVersion, 46 compressionMode: reader.compressionMode, 47 decompressor: reader.compressionMode.NewDecompressor(), 48 numDocs: reader.numDocs, 49 bytes: make([]byte, len(reader.bytes)), 50 closed: false, 51 } 52 } 53 54 // Sole constructor 55 func newCompressingStoredFieldsReader(d store.Directory, 56 si *model.SegmentInfo, segmentSuffix string, 57 fn model.FieldInfos, ctx store.IOContext, formatName string, 58 compressionMode CompressionMode) (r *CompressingStoredFieldsReader, err error) { 59 60 r = &CompressingStoredFieldsReader{} 61 r.compressionMode = compressionMode 62 segment := si.Name 63 r.fieldInfos = fn 64 r.numDocs = si.DocCount() 65 66 var indexStream store.ChecksumIndexInput 67 success := false 68 defer func() { 69 if !success { 70 util.CloseWhileSuppressingError(r, indexStream) 71 } 72 }() 73 74 indexStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_INDEX_EXTENSION) 75 fieldsStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_EXTENSION) 76 // Load the index into memory 77 if indexStream, err = d.OpenChecksumInput(indexStreamFN, ctx); err != nil { 78 return nil, err 79 } 80 codecNameIdx := formatName + CODEC_SFX_IDX 81 if r.version, err = int32AsInt(codec.CheckHeader(indexStream, codecNameIdx, 82 VERSION_START, VERSION_CURRENT)); err != nil { 83 return nil, err 84 } 85 assert(int64(codec.HeaderLength(codecNameIdx)) == indexStream.FilePointer()) 86 if r.indexReader, err = newCompressingStoredFieldsIndexReader(indexStream, si); err != nil { 87 return nil, err 88 } 89 90 var maxPointer int64 = -1 91 92 if r.version >= VERSION_CHECKSUM { 93 if maxPointer, err = indexStream.ReadVLong(); err != nil { 94 return nil, err 95 } 96 if _, err = codec.CheckFooter(indexStream); err != nil { 97 return nil, err 98 } 99 } else { 100 if err = codec.CheckEOF(indexStream); err != nil { 101 return nil, err 102 } 103 } 104 105 if err = indexStream.Close(); err != nil { 106 return nil, err 107 } 108 indexStream = nil 109 110 // Open the data file and read metadata 111 if r.fieldsStream, err = d.OpenInput(fieldsStreamFN, ctx); err != nil { 112 return nil, err 113 } 114 if r.version >= VERSION_CHECKSUM { 115 if maxPointer+codec.FOOTER_LENGTH != r.fieldsStream.Length() { 116 return nil, errors.New(fmt.Sprintf( 117 "Invalid fieldsStream maxPointer (file truncated?): maxPointer=%v, length=%v", 118 maxPointer, r.fieldsStream.Length())) 119 } 120 } else { 121 maxPointer = r.fieldsStream.Length() 122 } 123 r.maxPointer = maxPointer 124 codecNameDat := formatName + CODEC_SFX_DAT 125 var fieldsVersion int 126 if fieldsVersion, err = int32AsInt(codec.CheckHeader(r.fieldsStream, 127 codecNameDat, VERSION_START, VERSION_CURRENT)); err != nil { 128 return nil, err 129 } 130 assert2(r.version == fieldsVersion, 131 "Version mismatch between stored fields index and data: %v != %v", 132 r.version, fieldsVersion) 133 assert(int64(codec.HeaderLength(codecNameDat)) == r.fieldsStream.FilePointer()) 134 135 r.chunkSize = -1 136 if r.version >= VERSION_BIG_CHUNKS { 137 if r.chunkSize, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil { 138 return nil, err 139 } 140 } 141 142 if r.packedIntsVersion, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil { 143 return nil, err 144 } 145 r.decompressor = compressionMode.NewDecompressor() 146 r.bytes = make([]byte, 0) 147 148 if r.version >= VERSION_CHECKSUM { 149 // NOTE: data file is too costly to verify checksum against all the 150 // bytes on open, but fo rnow we at least verify proper structure 151 // of the checksum footer: which looks for FOOTER_MATIC + 152 // algorithmID. This is cheap and can detect some forms of 153 // corruption such as file trucation. 154 if _, err = codec.RetrieveChecksum(r.fieldsStream); err != nil { 155 return nil, err 156 } 157 } 158 159 success = true 160 return r, nil 161 } 162 163 func int32AsInt(n int32, err error) (int, error) { 164 return int(n), err 165 } 166 167 func (r *CompressingStoredFieldsReader) ensureOpen() { 168 assert2(!r.closed, "this FieldsReader is closed") 169 } 170 171 // Close the underlying IndexInputs 172 func (r *CompressingStoredFieldsReader) Close() (err error) { 173 if !r.closed { 174 if err = util.Close(r.fieldsStream); err == nil { 175 r.closed = true 176 } 177 } 178 return 179 } 180 181 func (r *CompressingStoredFieldsReader) readField(in util.DataInput, 182 visitor StoredFieldVisitor, info *model.FieldInfo, bits int) (err error) { 183 switch bits & TYPE_MASK { 184 case BYTE_ARR: 185 panic("not implemented yet") 186 case STRING: 187 var length int 188 if length, err = int32AsInt(in.ReadVInt()); err != nil { 189 return err 190 } 191 data := make([]byte, length) 192 if err = in.ReadBytes(data); err != nil { 193 return err 194 } 195 visitor.StringField(info, string(data)) 196 case NUMERIC_INT: 197 panic("not implemented yet") 198 case NUMERIC_FLOAT: 199 panic("not implemented yet") 200 case NUMERIC_LONG: 201 panic("not implemented yet") 202 case NUMERIC_DOUBLE: 203 panic("not implemented yet") 204 default: 205 panic(fmt.Sprintf("Unknown type flag: %x", bits)) 206 } 207 return nil 208 } 209 210 func (r *CompressingStoredFieldsReader) VisitDocument(docID int, visitor StoredFieldVisitor) error { 211 err := r.fieldsStream.Seek(r.indexReader.startPointer(docID)) 212 if err != nil { 213 return err 214 } 215 216 docBase, err := int32AsInt(r.fieldsStream.ReadVInt()) 217 if err != nil { 218 return err 219 } 220 chunkDocs, err := int32AsInt(r.fieldsStream.ReadVInt()) 221 if err != nil { 222 return err 223 } 224 if docID < docBase || 225 docID >= docBase+chunkDocs || 226 docBase+chunkDocs > r.numDocs { 227 return errors.New(fmt.Sprintf( 228 "Corrupted: docID=%v, docBase=%v, chunkDocs=%v, numDocs=%v (resource=%v)", 229 docID, docBase, chunkDocs, r.numDocs, r.fieldsStream)) 230 } 231 232 var numStoredFields, offset, length, totalLength int 233 if chunkDocs == 1 { 234 if numStoredFields, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil { 235 return err 236 } 237 offset = 0 238 if length, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil { 239 return err 240 } 241 totalLength = length 242 } else { 243 bitsPerStoredFields, err := int32AsInt(r.fieldsStream.ReadVInt()) 244 if err != nil { 245 return err 246 } 247 if bitsPerStoredFields == 0 { 248 numStoredFields, err = int32AsInt(r.fieldsStream.ReadVInt()) 249 if err != nil { 250 return err 251 } 252 } else if bitsPerStoredFields > 31 { 253 return errors.New(fmt.Sprintf("bitsPerStoredFields=%v (resource=%v)", 254 bitsPerStoredFields, r.fieldsStream)) 255 } else { 256 panic("not implemented yet") 257 } 258 259 bitsPerLength, err := int32AsInt(r.fieldsStream.ReadVInt()) 260 if err != nil { 261 return err 262 } 263 if bitsPerLength == 0 { 264 if length, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil { 265 return err 266 } 267 offset = (docID - docBase) * length 268 totalLength = chunkDocs * length 269 } else if bitsPerLength > 31 { 270 return errors.New(fmt.Sprintf("bitsPerLength=%v (resource=%v)", 271 bitsPerLength, r.fieldsStream)) 272 } else { 273 it := packed.ReaderIteratorNoHeader( 274 r.fieldsStream, packed.PackedFormat(packed.PACKED), r.packedIntsVersion, 275 chunkDocs, bitsPerLength, 1) 276 var n int64 277 off := 0 278 for i := 0; i < docID-docBase; i++ { 279 if n, err = it.Next(); err != nil { 280 return err 281 } 282 off += int(n) 283 } 284 offset = off 285 if n, err = it.Next(); err != nil { 286 return err 287 } 288 length = int(n) 289 off += length 290 for i := docID - docBase + 1; i < chunkDocs; i++ { 291 if n, err = it.Next(); err != nil { 292 return err 293 } 294 off += int(n) 295 } 296 totalLength = off 297 } 298 } 299 300 if (length == 0) != (numStoredFields == 0) { 301 return errors.New(fmt.Sprintf( 302 "length=%v, numStoredFields=%v (resource=%v)", 303 length, numStoredFields, r.fieldsStream)) 304 } 305 if numStoredFields == 0 { 306 // nothing to do 307 return nil 308 } 309 310 var documentInput util.DataInput 311 if r.version >= VERSION_BIG_CHUNKS && totalLength >= 2*r.chunkSize { 312 panic("not implemented yet") 313 } else { 314 var bytes []byte 315 if totalLength <= BUFFER_REUSE_THRESHOLD { 316 bytes = r.bytes 317 } else { 318 bytes = make([]byte, 0) 319 } 320 bytes, err = r.decompressor(r.fieldsStream, totalLength, offset, length, bytes) 321 if err != nil { 322 return err 323 } 324 assert(len(bytes) == length) 325 documentInput = store.NewByteArrayDataInput(bytes) 326 } 327 328 for fieldIDX := 0; fieldIDX < numStoredFields; fieldIDX++ { 329 infoAndBits, err := documentInput.ReadVLong() 330 if err != nil { 331 return err 332 } 333 fieldNumber := int(uint64(infoAndBits) >> uint64(TYPE_BITS)) 334 fieldInfo := r.fieldInfos.FieldInfoByNumber(fieldNumber) 335 336 bits := int(infoAndBits & int64(TYPE_MASK)) 337 assertWithMessage(bits <= NUMERIC_DOUBLE, fmt.Sprintf("bits=%x", bits)) 338 339 status, err := visitor.NeedsField(fieldInfo) 340 if err != nil { 341 return err 342 } 343 switch status { 344 case STORED_FIELD_VISITOR_STATUS_YES: 345 r.readField(documentInput, visitor, fieldInfo, bits) 346 case STORED_FIELD_VISITOR_STATUS_NO: 347 panic("not implemented yet") 348 case STORED_FIELD_VISITOR_STATUS_STOP: 349 return nil 350 } 351 } 352 353 return nil 354 } 355 356 func assertWithMessage(ok bool, msg string) { 357 if !ok { 358 panic(msg) 359 } 360 } 361 362 func (r *CompressingStoredFieldsReader) Clone() StoredFieldsReader { 363 r.ensureOpen() 364 return newCompressingStoredFieldsReaderFrom(r) 365 }