github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/lucene42/fieldInfos.go (about)

     1  package lucene42
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"github.com/balzaczyy/golucene/core/codec"
     7  	. "github.com/balzaczyy/golucene/core/codec/spi"
     8  	. "github.com/balzaczyy/golucene/core/index/model"
     9  	"github.com/balzaczyy/golucene/core/store"
    10  	"github.com/balzaczyy/golucene/core/util"
    11  	"log"
    12  )
    13  
    14  // lucene42/Lucene42FieldInfosFormat.java
    15  
/*
Lucene42FieldInfosFormat is the Lucene 4.2 Field Infos format.

Field names are stored in the field info file, with suffix .fnm.

FieldInfos (.fnm) --> Header, FieldsCount, <FieldName, FieldNumber,
                      FieldBits, DocValuesBits, Attributes>^FieldsCount

Data types:
- Header --> CodecHeader
- FieldsCount --> VInt
- FieldName --> string
- FieldBits, DocValuesBits --> byte
- FieldNumber --> VInt
- Attributes --> map[string]string

Field Description:
- FieldsCount: the number of fields in this file.
- FieldName: name of the field as a UTF-8 string.
- FieldNumber: the field's number. Note that unlike previous versions
  of Lucene, the fields are not numbered implicitly by their order in
  the file, but explicitly.
- FieldBits: a byte containing field options.
  - The low-order bit is one for indexed fields, and zero for non-indexed
    fields.
  - The second lowest-order bit is one for fields that have term vectors
    stored, and zero for fields without term vectors.
  - If the third lowest-order bit is set (0x4), offsets are stored into
    the postings list in addition to positions.
  - The fourth bit is unused.
  - If the fifth lowest-order bit is set (0x10), norms are omitted for
    the indexed field.
  - If the sixth lowest-order bit is set (0x20), payloads are stored
    for the indexed field.
  - If the seventh lowest-order bit is set (0x40), term frequencies
    and positions are omitted for the indexed field.
  - If the eighth lowest-order bit is set (0x80), positions are omitted
    for the indexed field.
- DocValuesBits: a byte containing per-document value types. The type
  is recorded as two four-bit integers, with the high-order bits
  representing norms options, and the low-order bits representing
  DocValues options. Each four-bit integer can be decoded as such:
  - 0: no DocValues for this field.
  - 1: NumericDocValues.
  - 2: BinaryDocValues.
  - 3: SortedDocValues.
  - 4: SortedSetDocValues.
- Attributes: a key-value map of codec-private attributes.
*/
type Lucene42FieldInfosFormat struct {
	// reader is the function handed out by FieldInfosReader().
	reader FieldInfosReader
	// The writer side is disabled; FieldInfosWriter() panics instead.
	// writer FieldInfosWriter
}
    68  
    69  func NewLucene42FieldInfosFormat() *Lucene42FieldInfosFormat {
    70  	return &Lucene42FieldInfosFormat{
    71  		reader: Lucene42FieldInfosReader,
    72  		// writer: Lucene42FieldInfosWriter,
    73  	}
    74  }
    75  
// FieldInfosReader returns the reader stored in this format.
func (f *Lucene42FieldInfosFormat) FieldInfosReader() FieldInfosReader {
	return f.reader
}
    79  
// FieldInfosWriter always panics: this format only supports reading, and
// the writer field it would return is commented out.
func (f *Lucene42FieldInfosFormat) FieldInfosWriter() FieldInfosWriter {
	panic("this codec can only be used for reading")
	// return f.writer
}
    84  
    85  const (
    86  	// Extension of field infos
    87  	LUCENE42_FI_EXTENSION = "fnm"
    88  
    89  	// Codec header
    90  	LUCENE42_FI_CODEC_NAME     = "Lucene42FieldInfos"
    91  	LUCENE42_FI_FORMAT_START   = 0
    92  	LUCENE42_FI_FORMAT_CURRENT = LUCENE42_FI_FORMAT_START
    93  
    94  	// Field flags
    95  	LUCENE42_FI_IS_INDEXED                   = 0x1
    96  	LUCENE42_FI_STORE_TERMVECTOR             = 0x2
    97  	LUCENE42_FI_STORE_OFFSETS_IN_POSTINGS    = 0x4
    98  	LUCENE42_FI_OMIT_NORMS                   = 0x10
    99  	LUCENE42_FI_STORE_PAYLOADS               = 0x20
   100  	LUCENE42_FI_OMIT_TERM_FREQ_AND_POSITIONS = 0x40
   101  	LUCENE42_FI_OMIT_POSITIONS               = 0x80
   102  )
   103  
   104  var Lucene42FieldInfosReader = func(dir store.Directory,
   105  	segment, suffix string, context store.IOContext) (fi FieldInfos, err error) {
   106  
   107  	log.Printf("Reading FieldInfos from %v...", dir)
   108  	fi = FieldInfos{}
   109  	fileName := util.SegmentFileName(segment, "", LUCENE42_FI_EXTENSION)
   110  	log.Printf("Segment: %v", fileName)
   111  	input, err := dir.OpenInput(fileName, context)
   112  	if err != nil {
   113  		return fi, err
   114  	}
   115  	log.Printf("Reading %v", input)
   116  
   117  	success := false
   118  	defer func() {
   119  		if success {
   120  			input.Close()
   121  		} else {
   122  			util.CloseWhileHandlingError(err, input)
   123  		}
   124  	}()
   125  
   126  	_, err = codec.CheckHeader(input,
   127  		LUCENE42_FI_CODEC_NAME,
   128  		LUCENE42_FI_FORMAT_START,
   129  		LUCENE42_FI_FORMAT_CURRENT)
   130  	if err != nil {
   131  		return fi, err
   132  	}
   133  
   134  	size, err := input.ReadVInt() //read in the size
   135  	if err != nil {
   136  		return fi, err
   137  	}
   138  	log.Printf("Found %v FieldInfos.", size)
   139  
   140  	infos := make([]*FieldInfo, size)
   141  	for i, _ := range infos {
   142  		name, err := input.ReadString()
   143  		if err != nil {
   144  			return fi, err
   145  		}
   146  		fieldNumber, err := input.ReadVInt()
   147  		if err != nil {
   148  			return fi, err
   149  		}
   150  		bits, err := input.ReadByte()
   151  		if err != nil {
   152  			return fi, err
   153  		}
   154  		isIndexed := (bits & LUCENE42_FI_IS_INDEXED) != 0
   155  		storeTermVector := (bits & LUCENE42_FI_STORE_TERMVECTOR) != 0
   156  		omitNorms := (bits & LUCENE42_FI_OMIT_NORMS) != 0
   157  		storePayloads := (bits & LUCENE42_FI_STORE_PAYLOADS) != 0
   158  		var indexOptions IndexOptions
   159  		switch {
   160  		case !isIndexed:
   161  			indexOptions = IndexOptions(0)
   162  		case (bits & LUCENE42_FI_OMIT_TERM_FREQ_AND_POSITIONS) != 0:
   163  			indexOptions = INDEX_OPT_DOCS_ONLY
   164  		case (bits & LUCENE42_FI_OMIT_POSITIONS) != 0:
   165  			indexOptions = INDEX_OPT_DOCS_AND_FREQS
   166  		case (bits & LUCENE42_FI_STORE_OFFSETS_IN_POSTINGS) != 0:
   167  			indexOptions = INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
   168  		default:
   169  			indexOptions = INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS
   170  		}
   171  
   172  		// DV Types are packed in one byte
   173  		val, err := input.ReadByte()
   174  		if err != nil {
   175  			return fi, err
   176  		}
   177  		docValuesType, err := getDocValuesType(input, (byte)(val&0x0F))
   178  		if err != nil {
   179  			return fi, err
   180  		}
   181  		normsType, err := getDocValuesType(input, (byte)((uint8(val)>>4)&0x0F))
   182  		if err != nil {
   183  			return fi, err
   184  		}
   185  		attributes, err := input.ReadStringStringMap()
   186  		if err != nil {
   187  			return fi, err
   188  		}
   189  		infos[i] = NewFieldInfo(name, isIndexed, fieldNumber, storeTermVector,
   190  			omitNorms, storePayloads, indexOptions, docValuesType, normsType, -1, attributes)
   191  	}
   192  
   193  	if err = codec.CheckEOF(input); err != nil {
   194  		return fi, err
   195  	}
   196  	fi = NewFieldInfos(infos)
   197  	success = true
   198  	return fi, nil
   199  }
   200  
   201  func getDocValuesType(input store.IndexInput, b byte) (t DocValuesType, err error) {
   202  	switch b {
   203  	case 0:
   204  		return DocValuesType(0), nil
   205  	case 1:
   206  		return DOC_VALUES_TYPE_NUMERIC, nil
   207  	case 2:
   208  		return DOC_VALUES_TYPE_BINARY, nil
   209  	case 3:
   210  		return DOC_VALUES_TYPE_SORTED, nil
   211  	case 4:
   212  		return DOC_VALUES_TYPE_SORTED_SET, nil
   213  	default:
   214  		return DocValuesType(0), errors.New(
   215  			fmt.Sprintf("invalid docvalues byte: %v (resource=%v)", b, input))
   216  	}
   217  }
   218  
   219  // lucene42/Lucene42FieldInfosWriter.java
   220  // var Lucene42FieldInfosWriter = func(dir store.Directory,
   221  // 	segName string, infos FieldInfos, ctx store.IOContext) (err error) {
   222  
   223  // 	fileName := util.SegmentFileName(segName, "", LUCENE42_FI_EXTENSION)
   224  // 	var output store.IndexOutput
   225  // 	output, err = dir.CreateOutput(fileName, ctx)
   226  // 	if err != nil {
   227  // 		return err
   228  // 	}
   229  
   230  // 	var success = false
   231  // 	defer func() {
   232  // 		if success {
   233  // 			err = mergeError(err, output.Close())
   234  // 		} else {
   235  // 			util.CloseWhileSuppressingError(output)
   236  // 		}
   237  // 	}()
   238  
   239  // 	err = codec.WriteHeader(output, LUCENE42_FI_CODEC_NAME, LUCENE42_FI_FORMAT_CURRENT)
   240  // 	if err != nil {
   241  // 		return err
   242  // 	}
   243  // 	err = output.WriteVInt(int32(len(infos.Values)))
   244  // 	if err != nil {
   245  // 		return err
   246  // 	}
   247  // 	for _, fi := range infos.Values {
   248  // 		indexOptions := fi.IndexOptions()
   249  // 		bits := byte(0x0)
   250  // 		if fi.HasVectors() {
   251  // 			bits |= LUCENE42_FI_STORE_TERMVECTOR
   252  // 		}
   253  // 		if fi.OmitsNorms() {
   254  // 			bits |= LUCENE42_FI_OMIT_NORMS
   255  // 		}
   256  // 		if fi.HasPayloads() {
   257  // 			bits |= LUCENE42_FI_STORE_PAYLOADS
   258  // 		}
   259  // 		if fi.IsIndexed() {
   260  // 			bits |= LUCENE42_FI_IS_INDEXED
   261  // 			assert(int(indexOptions) >= int(INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS) || !fi.HasPayloads())
   262  // 			switch indexOptions {
   263  // 			case INDEX_OPT_DOCS_ONLY:
   264  // 				bits |= LUCENE42_FI_OMIT_TERM_FREQ_AND_POSITIONS
   265  // 			case INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
   266  // 				bits |= LUCENE42_FI_STORE_OFFSETS_IN_POSTINGS
   267  // 			case INDEX_OPT_DOCS_AND_FREQS:
   268  // 				bits |= LUCENE42_FI_OMIT_POSITIONS
   269  // 			}
   270  // 		}
   271  // 		err = output.WriteString(fi.Name)
   272  // 		if err != nil {
   273  // 			return err
   274  // 		}
   275  // 		err = output.WriteVInt(fi.Number)
   276  // 		if err != nil {
   277  // 			return err
   278  // 		}
   279  // 		err = output.WriteByte(bits)
   280  // 		if err != nil {
   281  // 			return err
   282  // 		}
   283  
   284  // 		// pack the DV types in one byte
   285  // 		dv := docValuesByte(fi.DocValuesType())
   286  // 		nrm := docValuesByte(fi.NormType())
   287  // 		assert((int(dv)&(^0xF)) == 0 && (int(nrm)&(^0x0F)) == 0)
   288  // 		val := byte(0xFF & ((nrm << 4) | dv))
   289  // 		err = output.WriteByte(val)
   290  // 		if err != nil {
   291  // 			return err
   292  // 		}
   293  // 		err = output.WriteStringStringMap(fi.Attributes())
   294  // 		if err != nil {
   295  // 			return err
   296  // 		}
   297  // 	}
   298  // 	success = true
   299  // 	return nil
   300  // }
   301  
   302  // func docValuesByte(typ DocValuesType) byte {
   303  // 	n := byte(typ)
   304  // 	assert(n >= 0 && n <= 4)
   305  // 	return n
   306  // }