github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/compressing/storedFieldsReader.go

package compressing

import (
	"fmt"
	"github.com/balzaczyy/golucene/core/codec"
	"github.com/balzaczyy/golucene/core/codec/lucene40"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	"github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
	"github.com/balzaczyy/golucene/core/util/packed"
)

// Ported from codec/compressing/CompressingStoredFieldsReader.java

// Do not reuse the decompression buffer when there is more than 32 KB to decompress.
const BUFFER_REUSE_THRESHOLD = 1 << 15

// StoredFieldsReader impl for CompressingStoredFieldsFormat
type CompressingStoredFieldsReader struct {
	version           int
	fieldInfos        model.FieldInfos
	indexReader       *CompressingStoredFieldsIndexReader
	maxPointer        int64
	fieldsStream      store.IndexInput
	chunkSize         int
	packedIntsVersion int
	compressionMode   CompressionMode
	decompressor      Decompressor
	bytes             []byte
	numDocs           int
	closed            bool
}

// newCompressingStoredFieldsReaderFrom is used by Clone. The cloned reader
// gets its own IndexInput clone, index reader clone, decompressor and byte
// buffer, so it can be used independently of the original reader.
func newCompressingStoredFieldsReaderFrom(reader *CompressingStoredFieldsReader) *CompressingStoredFieldsReader {
	return &CompressingStoredFieldsReader{
		version:           reader.version,
		fieldInfos:        reader.fieldInfos,
		fieldsStream:      reader.fieldsStream.Clone(),
		indexReader:       reader.indexReader.Clone(),
		maxPointer:        reader.maxPointer,
		chunkSize:         reader.chunkSize,
		packedIntsVersion: reader.packedIntsVersion,
		compressionMode:   reader.compressionMode,
		decompressor:      reader.compressionMode.NewDecompressor(),
		numDocs:           reader.numDocs,
		bytes:             make([]byte, len(reader.bytes)),
		closed:            false,
	}
}

// Sole constructor
func newCompressingStoredFieldsReader(d store.Directory,
	si *model.SegmentInfo, segmentSuffix string,
	fn model.FieldInfos, ctx store.IOContext, formatName string,
	compressionMode CompressionMode) (r *CompressingStoredFieldsReader, err error) {

	r = &CompressingStoredFieldsReader{}
	r.compressionMode = compressionMode
	segment := si.Name
	r.fieldInfos = fn
	r.numDocs = si.DocCount()

	var indexStream store.ChecksumIndexInput
	success := false
	defer func() {
		if !success {
			util.CloseWhileSuppressingError(r, indexStream)
		}
	}()

	indexStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_INDEX_EXTENSION)
	fieldsStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_EXTENSION)
	// Load the index into memory
	if indexStream, err = d.OpenChecksumInput(indexStreamFN, ctx); err != nil {
		return nil, err
	}
	codecNameIdx := formatName + CODEC_SFX_IDX
	if r.version, err = int32AsInt(codec.CheckHeader(indexStream, codecNameIdx,
		VERSION_START, VERSION_CURRENT)); err != nil {
		return nil, err
	}
	assert(int64(codec.HeaderLength(codecNameIdx)) == indexStream.FilePointer())
	if r.indexReader, err = newCompressingStoredFieldsIndexReader(indexStream, si); err != nil {
		return nil, err
	}

	var maxPointer int64 = -1

	if r.version >= VERSION_CHECKSUM {
		if maxPointer, err = indexStream.ReadVLong(); err != nil {
			return nil, err
		}
		if _, err = codec.CheckFooter(indexStream); err != nil {
			return nil, err
		}
	} else {
		if err = codec.CheckEOF(indexStream); err != nil {
			return nil, err
		}
	}

	if err = indexStream.Close(); err != nil {
		return nil, err
	}
	indexStream = nil

	// Open the data file and read metadata
	if r.fieldsStream, err = d.OpenInput(fieldsStreamFN, ctx); err != nil {
		return nil, err
	}
	if r.version >= VERSION_CHECKSUM {
		if maxPointer+codec.FOOTER_LENGTH != r.fieldsStream.Length() {
			return nil, fmt.Errorf(
				"Invalid fieldsStream maxPointer (file truncated?): maxPointer=%v, length=%v",
				maxPointer, r.fieldsStream.Length())
		}
	} else {
		maxPointer = r.fieldsStream.Length()
	}
	r.maxPointer = maxPointer
	codecNameDat := formatName + CODEC_SFX_DAT
	var fieldsVersion int
	if fieldsVersion, err = int32AsInt(codec.CheckHeader(r.fieldsStream,
		codecNameDat, VERSION_START, VERSION_CURRENT)); err != nil {
		return nil, err
	}
	assert2(r.version == fieldsVersion,
		"Version mismatch between stored fields index and data: %v != %v",
		r.version, fieldsVersion)
	assert(int64(codec.HeaderLength(codecNameDat)) == r.fieldsStream.FilePointer())

	r.chunkSize = -1
	if r.version >= VERSION_BIG_CHUNKS {
		if r.chunkSize, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
			return nil, err
		}
	}

	if r.packedIntsVersion, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
		return nil, err
	}
	r.decompressor = compressionMode.NewDecompressor()
	r.bytes = make([]byte, 0)

	if r.version >= VERSION_CHECKSUM {
		// NOTE: the data file is too costly to verify checksum against all
		// the bytes on open, so for now we at least verify proper structure
		// of the checksum footer: this looks for FOOTER_MAGIC +
		// algorithmID. It is cheap and can detect some forms of corruption
		// such as file truncation.
		if _, err = codec.RetrieveChecksum(r.fieldsStream); err != nil {
			return nil, err
		}
	}

	success = true
	return r, nil
}
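
// A minimal usage sketch (hypothetical: dir, si, infos, ctx, mode, docID and
// visitor stand in for values the surrounding codec would supply, and
// "Lucene41StoredFields" is only an illustrative format name):
//
//	r, err := newCompressingStoredFieldsReader(dir, si, "", infos, ctx,
//		"Lucene41StoredFields", mode)
//	if err != nil { /* handle the error */ }
//	defer r.Close()
//	err = r.VisitDocument(docID, visitor) // visitor receives each stored field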

// int32AsInt adapts an (int32, error) return value to (int, error), so that
// DataInput read calls can be chained directly into int-typed assignments.
func int32AsInt(n int32, err error) (int, error) {
	return int(n), err
}

// ensureOpen panics if this reader has already been closed.
func (r *CompressingStoredFieldsReader) ensureOpen() {
	assert2(!r.closed, "this FieldsReader is closed")
}

// Close the underlying IndexInputs
func (r *CompressingStoredFieldsReader) Close() (err error) {
	if !r.closed {
		if err = util.Close(r.fieldsStream); err == nil {
			r.closed = true
		}
	}
	return
}

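// readField reads a single stored field from in and feeds it to visitor. The
// low TYPE_BITS bits of the per-field header select the value type (BYTE_ARR,
// STRING or one of the NUMERIC_* flags); only STRING is implemented in this
// port, read as a VInt length followed by that many bytes of UTF-8 data.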
func (r *CompressingStoredFieldsReader) readField(in util.DataInput,
	visitor StoredFieldVisitor, info *model.FieldInfo, bits int) (err error) {
	switch bits & TYPE_MASK {
	case BYTE_ARR:
		panic("not implemented yet")
	case STRING:
		var length int
		if length, err = int32AsInt(in.ReadVInt()); err != nil {
			return err
		}
		data := make([]byte, length)
		if err = in.ReadBytes(data); err != nil {
			return err
		}
		visitor.StringField(info, string(data))
	case NUMERIC_INT:
		panic("not implemented yet")
	case NUMERIC_FLOAT:
		panic("not implemented yet")
	case NUMERIC_LONG:
		panic("not implemented yet")
	case NUMERIC_DOUBLE:
		panic("not implemented yet")
	default:
		panic(fmt.Sprintf("Unknown type flag: %x", bits))
	}
	return nil
}

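// VisitDocument positions fieldsStream at the chunk that contains docID and
// replays that document's stored fields through visitor. As decoded below, a
// chunk starts with docBase and chunkDocs (VInts), followed by the number of
// stored fields and the uncompressed length of each document (single VInts
// when chunkDocs == 1 or when all documents share one value, packed ints
// otherwise), followed by the compressed documents themselves.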
func (r *CompressingStoredFieldsReader) VisitDocument(docID int, visitor StoredFieldVisitor) error {
	err := r.fieldsStream.Seek(r.indexReader.startPointer(docID))
	if err != nil {
		return err
	}

	docBase, err := int32AsInt(r.fieldsStream.ReadVInt())
	if err != nil {
		return err
	}
	chunkDocs, err := int32AsInt(r.fieldsStream.ReadVInt())
	if err != nil {
		return err
	}
	if docID < docBase ||
		docID >= docBase+chunkDocs ||
		docBase+chunkDocs > r.numDocs {
		return fmt.Errorf(
			"Corrupted: docID=%v, docBase=%v, chunkDocs=%v, numDocs=%v (resource=%v)",
			docID, docBase, chunkDocs, r.numDocs, r.fieldsStream)
	}

	var numStoredFields, offset, length, totalLength int
	if chunkDocs == 1 {
		if numStoredFields, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
			return err
		}
		offset = 0
		if length, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
			return err
		}
		totalLength = length
	} else {
		bitsPerStoredFields, err := int32AsInt(r.fieldsStream.ReadVInt())
		if err != nil {
			return err
		}
		if bitsPerStoredFields == 0 {
			// every document in the chunk has the same number of stored fields
			numStoredFields, err = int32AsInt(r.fieldsStream.ReadVInt())
			if err != nil {
				return err
			}
		} else if bitsPerStoredFields > 31 {
			return fmt.Errorf("bitsPerStoredFields=%v (resource=%v)",
				bitsPerStoredFields, r.fieldsStream)
		} else {
			panic("not implemented yet")
		}

		bitsPerLength, err := int32AsInt(r.fieldsStream.ReadVInt())
		if err != nil {
			return err
		}
		if bitsPerLength == 0 {
			// every document in the chunk has the same length
			if length, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
				return err
			}
			offset = (docID - docBase) * length
			totalLength = chunkDocs * length
		} else if bitsPerLength > 31 {
			return fmt.Errorf("bitsPerLength=%v (resource=%v)",
				bitsPerLength, r.fieldsStream)
		} else {
			// walk the packed per-document lengths to derive this document's
			// offset and length, and the total length of the chunk
			it := packed.ReaderIteratorNoHeader(
				r.fieldsStream, packed.PackedFormat(packed.PACKED), r.packedIntsVersion,
				chunkDocs, bitsPerLength, 1)
			var n int64
			off := 0
			for i := 0; i < docID-docBase; i++ {
				if n, err = it.Next(); err != nil {
					return err
				}
				off += int(n)
			}
			offset = off
			if n, err = it.Next(); err != nil {
				return err
			}
			length = int(n)
			off += length
			for i := docID - docBase + 1; i < chunkDocs; i++ {
				if n, err = it.Next(); err != nil {
					return err
				}
				off += int(n)
			}
			totalLength = off
		}
	}

	if (length == 0) != (numStoredFields == 0) {
		return fmt.Errorf(
			"length=%v, numStoredFields=%v (resource=%v)",
			length, numStoredFields, r.fieldsStream)
	}
	if numStoredFields == 0 {
		// nothing to do
		return nil
	}

	var documentInput util.DataInput
	if r.version >= VERSION_BIG_CHUNKS && totalLength >= 2*r.chunkSize {
		panic("not implemented yet")
	} else {
		// reuse the shared buffer only for small chunks; see BUFFER_REUSE_THRESHOLD
		var bytes []byte
		if totalLength <= BUFFER_REUSE_THRESHOLD {
			bytes = r.bytes
		} else {
			bytes = make([]byte, 0)
		}
		bytes, err = r.decompressor(r.fieldsStream, totalLength, offset, length, bytes)
		if err != nil {
			return err
		}
		assert(len(bytes) == length)
		documentInput = store.NewByteArrayDataInput(bytes)
	}

	for fieldIDX := 0; fieldIDX < numStoredFields; fieldIDX++ {
		infoAndBits, err := documentInput.ReadVLong()
		if err != nil {
			return err
		}
		// the upper bits carry the field number, the low TYPE_BITS bits the type flag
		fieldNumber := int(uint64(infoAndBits) >> uint64(TYPE_BITS))
		fieldInfo := r.fieldInfos.FieldInfoByNumber(fieldNumber)

		bits := int(infoAndBits & int64(TYPE_MASK))
		assertWithMessage(bits <= NUMERIC_DOUBLE, fmt.Sprintf("bits=%x", bits))

		status, err := visitor.NeedsField(fieldInfo)
		if err != nil {
			return err
		}
		switch status {
		case STORED_FIELD_VISITOR_STATUS_YES:
			if err = r.readField(documentInput, visitor, fieldInfo, bits); err != nil {
				return err
			}
		case STORED_FIELD_VISITOR_STATUS_NO:
			panic("not implemented yet")
		case STORED_FIELD_VISITOR_STATUS_STOP:
			return nil
		}
	}

	return nil
}

func assertWithMessage(ok bool, msg string) {
	if !ok {
		panic(msg)
	}
}

// Clone returns an independent reader over the same files; see
// newCompressingStoredFieldsReaderFrom.
func (r *CompressingStoredFieldsReader) Clone() StoredFieldsReader {
	r.ensureOpen()
	return newCompressingStoredFieldsReaderFrom(r)
}
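
// A per-goroutine sketch (hypothetical; it assumes the usual Lucene contract
// that a cloned reader has private file pointers over the same open files):
//
//	perGoroutine := r.Clone()
//	err := perGoroutine.VisitDocument(docID, visitor)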