github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/compressing/storedFieldsWriter.go

package compressing

import (
	"errors"
	"fmt"
	"github.com/balzaczyy/golucene/core/codec"
	"github.com/balzaczyy/golucene/core/codec/lucene40"
	"github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
	"github.com/balzaczyy/golucene/core/util/packed"
	"math"
)

/* hard limit on the maximum number of documents per chunk */
const MAX_DOCUMENTS_PER_CHUNK = 128

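/* per-field type markers, stored in the low TYPE_BITS bits of the VLong that
prefixes every stored field; the high bits carry the field number */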
const (
	STRING         = 0x00
	BYTE_ARR       = 0x01
	NUMERIC_INT    = 0x02
	NUMERIC_FLOAT  = 0x03
	NUMERIC_LONG   = 0x04
	NUMERIC_DOUBLE = 0x05
)

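/* TYPE_BITS is the number of bits needed to encode the largest type marker
(NUMERIC_DOUBLE); TYPE_MASK selects those bits back out when decoding */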
var (
	TYPE_BITS = packed.BitsRequired(NUMERIC_DOUBLE)
	TYPE_MASK = int(packed.MaxValue(TYPE_BITS))
)

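/* codec name suffixes and format versions; VERSION_BIG_CHUNKS appears to
correspond to the sliced compression of oversized chunks in flush, and
VERSION_CHECKSUM to the footer written in Finish */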
const (
	CODEC_SFX_IDX      = "Index"
	CODEC_SFX_DAT      = "Data"
	VERSION_START      = 0
	VERSION_BIG_CHUNKS = 1
	VERSION_CHECKSUM   = 2
	VERSION_CURRENT    = VERSION_CHECKSUM
)

/* StoredFieldsWriter impl for CompressingStoredFieldsFormat */
type CompressingStoredFieldsWriter struct {
	directory     store.Directory
	segment       string
	segmentSuffix string
	indexWriter   *StoredFieldsIndexWriter
	fieldsStream  store.IndexOutput

	compressionMode CompressionMode
	compressor      Compressor
	chunkSize       int

	bufferedDocs    *GrowableByteArrayDataOutput
	numStoredFields []int // number of stored fields per buffered doc
	endOffsets      []int // end offsets in bufferedDocs
	docBase         int   // doc ID at the beginning of the chunk
	numBufferedDocs int   // docBase + numBufferedDocs == current doc ID

	numStoredFieldsInDoc int
}

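/* NewCompressingStoredFieldsWriter creates the fields data and fields index
outputs for the segment, writes their codec headers, and records the chunk
size and packed-ints version so the corresponding reader can decode the
per-chunk metadata */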
func NewCompressingStoredFieldsWriter(dir store.Directory, si *model.SegmentInfo,
	segmentSuffix string, ctx store.IOContext, formatName string,
	compressionMode CompressionMode, chunkSize int) (*CompressingStoredFieldsWriter, error) {

	assert(dir != nil)
	ans := &CompressingStoredFieldsWriter{
		directory:       dir,
		segment:         si.Name,
		segmentSuffix:   segmentSuffix,
		compressionMode: compressionMode,
		compressor:      compressionMode.NewCompressor(),
		chunkSize:       chunkSize,
		docBase:         0,
		bufferedDocs:    newGrowableByteArrayDataOutput(chunkSize),
		numStoredFields: make([]int, 16),
		endOffsets:      make([]int, 16),
		numBufferedDocs: 0,
	}

	var success = false
	indexStream, err := dir.CreateOutput(util.SegmentFileName(si.Name, segmentSuffix,
		lucene40.FIELDS_INDEX_EXTENSION), ctx)
	if err != nil {
		return nil, err
	}
	assert(indexStream != nil)
	defer func() {
		if !success {
			util.CloseWhileSuppressingError(indexStream)
			ans.Abort()
		}
	}()

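	// Any early return below leaves success false, so the deferred cleanup
	// above closes the index stream and Abort deletes the partial files.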
	ans.fieldsStream, err = dir.CreateOutput(util.SegmentFileName(si.Name, segmentSuffix,
		lucene40.FIELDS_EXTENSION), ctx)
	if err != nil {
		return nil, err
	}

	codecNameIdx := formatName + CODEC_SFX_IDX
	codecNameDat := formatName + CODEC_SFX_DAT
	err = codec.WriteHeader(indexStream, codecNameIdx, VERSION_CURRENT)
	if err != nil {
		return nil, err
	}
	err = codec.WriteHeader(ans.fieldsStream, codecNameDat, VERSION_CURRENT)
	if err != nil {
		return nil, err
	}
	assert(int64(codec.HeaderLength(codecNameIdx)) == indexStream.FilePointer())
	assert(int64(codec.HeaderLength(codecNameDat)) == ans.fieldsStream.FilePointer())

	ans.indexWriter, err = NewStoredFieldsIndexWriter(indexStream)
	if err != nil {
		return nil, err
	}
	assert(ans.indexWriter != nil)
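	// indexWriter now owns indexStream; clear the local reference so the
	// deferred cleanup does not close the stream a second time.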
	indexStream = nil

	err = ans.fieldsStream.WriteVInt(int32(chunkSize))
	if err != nil {
		return nil, err
	}
	err = ans.fieldsStream.WriteVInt(packed.VERSION_CURRENT)
	if err != nil {
		return nil, err
	}

	success = true
	return ans, nil
}

func assert(ok bool) {
	assert2(ok, "assert fail")
}

func assert2(ok bool, msg string, args ...interface{}) {
	if !ok {
		panic(fmt.Sprintf(msg, args...))
	}
}

func (w *CompressingStoredFieldsWriter) Close() error {
	assert(w != nil)
	defer func() {
		w.fieldsStream = nil
		w.indexWriter = nil
	}()
	return util.Close(w.fieldsStream, w.indexWriter)
}

func (w *CompressingStoredFieldsWriter) StartDocument() error { return nil }

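/* FinishDocument records the field count and end offset of the document just
buffered, growing the bookkeeping arrays as needed, and flushes once the
chunk is full */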
func (w *CompressingStoredFieldsWriter) FinishDocument() error {
	if w.numBufferedDocs == len(w.numStoredFields) {
		newLength := util.Oversize(w.numBufferedDocs+1, 4)

		oldArray := w.endOffsets
		w.endOffsets = make([]int, newLength)
		copy(w.endOffsets, oldArray)

		oldArray = w.numStoredFields
		w.numStoredFields = make([]int, newLength)
		copy(w.numStoredFields, oldArray)
	}
	w.numStoredFields[w.numBufferedDocs] = w.numStoredFieldsInDoc
	w.numStoredFieldsInDoc = 0
	w.endOffsets[w.numBufferedDocs] = w.bufferedDocs.length
	w.numBufferedDocs++
	if w.triggerFlush() {
		return w.flush()
	}
	return nil
}

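/* saveInts writes a non-empty []int in one of three encodings: a single value
as one VInt; all-equal values as a 0 token followed by the shared value;
otherwise the bit width followed by the values, bit-packed */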
func saveInts(values []int, out DataOutput) error {
	length := len(values)
	assert(length > 0)
	if length == 1 {
		return out.WriteVInt(int32(values[0]))
	}

	var allEqual = true
	var sentinel = values[0]
	for _, v := range values[1:] {
		if v != sentinel {
			allEqual = false
			break
		}
	}
	if allEqual {
		err := out.WriteVInt(0)
		if err == nil {
			err = out.WriteVInt(int32(values[0]))
		}
		return err
	}

	var max int64 = 0
	for _, v := range values {
		// OR-ing is enough: BitsRequired only depends on the highest set bit
		max |= int64(v)
	}
	var bitsRequired = packed.BitsRequired(max)
	err := out.WriteVInt(int32(bitsRequired))
	if err != nil {
		return err
	}

	w := packed.WriterNoHeader(out, packed.PackedFormat(packed.PACKED), length, bitsRequired, 1)
	for _, v := range values {
		if err = w.Add(int64(v)); err != nil {
			return err
		}
	}
	return w.Finish()
}

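/* writeHeader writes the chunk header: the first doc ID of the chunk and the
number of docs it holds, then the per-doc field counts and byte lengths via
saveInts */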
func (w *CompressingStoredFieldsWriter) writeHeader(docBase,
	numBufferedDocs int, numStoredFields, lengths []int) error {

	// save docBase and numBufferedDocs
	err := w.fieldsStream.WriteVInt(int32(docBase)) // TODO precision loss risk
	if err == nil {
		err = w.fieldsStream.WriteVInt(int32(numBufferedDocs)) // TODO precision loss risk
		if err == nil {
			// save numStoredFields
			err = saveInts(numStoredFields[:numBufferedDocs], w.fieldsStream)
			if err == nil {
				// save lengths
				err = saveInts(lengths[:numBufferedDocs], w.fieldsStream)
			}
		}
	}
	return err
}

func (w *CompressingStoredFieldsWriter) triggerFlush() bool {
	return w.bufferedDocs.length >= w.chunkSize || // chunks of at least chunkSize bytes
		w.numBufferedDocs >= MAX_DOCUMENTS_PER_CHUNK
}

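/* flush writes one chunk: an index entry, the header, then the buffered
document bytes, compressed as a whole or, for chunks of at least twice
chunkSize, in chunkSize slices (the VERSION_BIG_CHUNKS layout); end offsets
are converted to lengths in place, the first end offset already being a
length */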
func (w *CompressingStoredFieldsWriter) flush() error {
	err := w.indexWriter.writeIndex(w.numBufferedDocs, w.fieldsStream.FilePointer())
	if err != nil {
		return err
	}

	// transform end offsets into lengths
	lengths := w.endOffsets
	for i := w.numBufferedDocs - 1; i > 0; i-- {
		lengths[i] = w.endOffsets[i] - w.endOffsets[i-1]
		assert(lengths[i] >= 0)
	}
	err = w.writeHeader(w.docBase, w.numBufferedDocs, w.numStoredFields, lengths)
	if err != nil {
		return err
	}

	// compress stored fields to fieldsStream
	if w.bufferedDocs.length >= 2*w.chunkSize {
		// big chunk, slice it
		for compressed := 0; compressed < w.bufferedDocs.length; compressed += w.chunkSize {
			size := w.bufferedDocs.length - compressed
			if w.chunkSize < size {
				size = w.chunkSize
			}
			err = w.compressor(w.bufferedDocs.bytes[compressed:compressed+size], w.fieldsStream)
			if err != nil {
				return err
			}
		}
	} else {
		err = w.compressor(w.bufferedDocs.bytes[:w.bufferedDocs.length], w.fieldsStream)
		if err != nil {
			return err
		}
	}

	// reset
	w.docBase += w.numBufferedDocs
	w.numBufferedDocs = 0
	w.bufferedDocs.length = 0
	return nil
}

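/* WriteField buffers one stored field: a VLong packing the field number in
the high bits and the type marker in the low TYPE_BITS bits, followed by the
value itself */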
func (w *CompressingStoredFieldsWriter) WriteField(info *model.FieldInfo, field model.IndexableField) error {
	w.numStoredFieldsInDoc++

	bits := 0
	var bytes []byte
	var str string

	number := field.NumericValue()
	if number != nil {
		switch number.(type) {
		case int32:
			bits = NUMERIC_INT
		case int64:
			bits = NUMERIC_LONG
		case float32:
			bits = NUMERIC_FLOAT
		case float64:
			bits = NUMERIC_DOUBLE
		default:
			panic(fmt.Sprintf("cannot store numeric value %v of type %T", number, number))
		}
	} else {
		bytes = field.BinaryValue()
		if bytes != nil {
			bits = BYTE_ARR
		} else {
			bits = STRING
			str = field.StringValue()
			assert2(str != "",
				"field %v is stored but does not have binaryValue, stringValue nor numericValue",
				field.Name())
		}
	}

	infoAndBits := (int64(info.Number) << uint(TYPE_BITS)) | int64(bits)
	err := w.bufferedDocs.WriteVLong(infoAndBits)
	if err != nil {
		return err
	}

	switch {
	case bytes != nil:
		err = w.bufferedDocs.WriteVInt(int32(len(bytes)))
		if err == nil {
			err = w.bufferedDocs.WriteBytes(bytes)
		}
	case str != "":
		err = w.bufferedDocs.WriteString(str)
	case bits == NUMERIC_INT:
		err = w.bufferedDocs.WriteInt(number.(int32))
	case bits == NUMERIC_LONG:
		err = w.bufferedDocs.WriteLong(number.(int64))
	case bits == NUMERIC_FLOAT:
		err = w.bufferedDocs.WriteInt(int32(math.Float32bits(number.(float32))))
	case bits == NUMERIC_DOUBLE:
		err = w.bufferedDocs.WriteLong(int64(math.Float64bits(number.(float64))))
	default:
		panic("Cannot get here")
	}
	return err
}

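/* Abort closes the writer while suppressing any error and deletes the
partially written fields data and fields index files */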
func (w *CompressingStoredFieldsWriter) Abort() {
	if w == nil { // tolerate early released pointer
		return
	}
	util.CloseWhileSuppressingError(w)
	util.DeleteFilesIgnoringErrors(w.directory,
		util.SegmentFileName(w.segment, w.segmentSuffix, lucene40.FIELDS_EXTENSION),
		util.SegmentFileName(w.segment, w.segmentSuffix, lucene40.FIELDS_INDEX_EXTENSION))
}

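/* Finish flushes any remaining buffered documents, verifies that exactly
numDocs documents were written, finalizes the chunk index, and writes the
checksum footer */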
func (w *CompressingStoredFieldsWriter) Finish(fis model.FieldInfos, numDocs int) (err error) {
	if w == nil {
		return errors.New("nil CompressingStoredFieldsWriter pointer")
	}
	assert2(w.indexWriter != nil, "already closed?")
	if w.numBufferedDocs > 0 {
		if err = w.flush(); err != nil {
			return err
		}
	} else {
		assert(w.bufferedDocs.length == 0)
	}
	assert2(w.docBase == numDocs,
		"Wrote %v docs, finish called with numDocs=%v", w.docBase, numDocs)
	if err = w.indexWriter.finish(numDocs, w.fieldsStream.FilePointer()); err != nil {
		return err
	}
	if err = codec.WriteFooter(w.fieldsStream); err != nil {
		return err
	}
	assert(w.bufferedDocs.length == 0)
	return nil
}

// util/GrowableByteArrayDataOutput.java

/* A DataOutput that can be used to build a []byte */
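/* length is tracked separately from len(bytes) so that flush can reset the
logical size to zero while the already-grown backing slice is reused */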
type GrowableByteArrayDataOutput struct {
	*util.DataOutputImpl
	bytes  []byte
	length int
}

func newGrowableByteArrayDataOutput(cp int) *GrowableByteArrayDataOutput {
	ans := &GrowableByteArrayDataOutput{bytes: make([]byte, 0, util.Oversize(cp, 1))}
	ans.DataOutputImpl = util.NewDataOutput(ans)
	return ans
}

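/* WriteByte overwrites in place while the logical length is still within the
backing slice (after a flush reset) and appends once it reaches the end */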
func (out *GrowableByteArrayDataOutput) WriteByte(b byte) error {
	assert(out.length <= len(out.bytes))
	if out.length < len(out.bytes) {
		out.bytes[out.length] = b
	} else {
		out.bytes = append(out.bytes, b)
	}
	out.length++
	return nil
}

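/* WriteBytes copies into the reusable region first, then appends whatever
does not fit, growing the backing slice */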
func (out *GrowableByteArrayDataOutput) WriteBytes(b []byte) error {
	assert(out.length <= len(out.bytes))
	remaining := len(out.bytes) - out.length
	if remaining > len(b) {
		copy(out.bytes[out.length:], b)
	} else if remaining == 0 {
		out.bytes = append(out.bytes, b...)
	} else {
		copy(out.bytes[out.length:], b[:remaining])
		out.bytes = append(out.bytes, b[remaining:]...)
	}
	out.length += len(b)
	return nil
}