github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/lucene42/termVectors.go

package lucene42

import (
	"github.com/balzaczyy/golucene/core/codec/compressing"
)

// lucene42/Lucene42TermVectorsFormat.java

/*
Lucene 4.2 term vectors format.

Very similarly to Lucene41StoredFieldsFormat, this format is based on
compressed chunks of data, with document-level granularity so that a
document can never span across distinct chunks. Moreover, data is
made as compact as possible:

- textual data is compressed using the very light LZ4 compression
algorithm,
- binary data is written using fixed-size blocks of packed ints.

Term vectors are stored using two files:

- a data file where terms, frequencies, positions, offsets and
payloads are stored,
- an index file, loaded into memory, used to locate specific
documents in the data file.

Looking up term vectors for any document requires at most 1 disk seek.

File formats

1. vector_data

A vector data file (extension .tvd). This file stores terms,
frequencies, positions, offsets and payloads for every document. Upon
writing a new segment, it accumulates data into memory until the
buffer used to store terms and payloads grows beyond 4KB. Then it
flushes all metadata, terms and positions to disk using LZ4
compression for terms and payloads and blocks of packed ints for
positions.

Here is a more detailed description of the field data file format:

- VectorData (.tvd) --> <Header>, PackedIntsVersion, ChunkSize, <Chunk>^ChunkCount
- Header --> CodecHeader
- PackedIntsVersion --> PackedInts.CURRENT_VERSION as a VInt
- ChunkSize is the number of bytes of terms to accumulate before
flushing, as a VInt
- ChunkCount is not known in advance and is the number of chunks
necessary to store all documents of the segment
- Chunk --> DocBase, ChunkDocs, <NumFields>, <FieldNums>, <FieldNumOffs>, <Flags>,
<NumTerms>, <TermLengths>, <TermFreqs>, <Positions>, <StartOffsets>, <Lengths>,
<PayloadLengths>, <TermAndPayloads>
- DocBase is the ID of the first doc of the chunk as a VInt
- ChunkDocs is the number of documents in the chunk
- NumFields --> DocNumFields^ChunkDocs
- DocNumFields is the number of fields for each doc, written as a
VInt if ChunkDocs==1 and as a PackedInts array otherwise
- FieldNums --> FieldNumDelta^TotalFields, as a PackedInts array
- FieldNumOff is the offset of the field number in FieldNums
- TotalFields is the total number of fields (sum of the values of NumFields)
- Flags --> Bit <FieldFlags>
- Bit is a single bit which when true means that fields have the same
options for every document in the chunk
- FieldFlags --> if Bit==1: Flag^TotalDistinctFields else Flag^TotalFields
- Flag: a 3-bit int (see the sketch at the end of this file) where:
  - the first bit means that the field has positions
  - the second bit means that the field has offsets
  - the third bit means that the field has payloads
- NumTerms --> FieldNumTerms^TotalFields
- FieldNumTerms: the number of terms for each field, using blocks of 64 packed ints
- TermLengths --> PrefixLength^TotalTerms SuffixLength^TotalTerms
- TotalTerms: total number of terms (sum of NumTerms)
- PrefixLength: 0 for the first term of a field, the length of the
common prefix with the previous term otherwise, using blocks of 64
packed ints
- SuffixLength: length of the term minus PrefixLength for every term,
using blocks of 64 packed ints (see the sketch at the end of this file)
- TermFreqs --> TermFreqMinus1^TotalTerms
- TermFreqMinus1: (frequency - 1) for each term using blocks of 64
packed ints
- Positions --> PositionDelta^TotalPositions
- TotalPositions is the sum of frequencies of terms of all fields that have positions
- PositionDelta: the absolute position for the first position of a
term, and the difference with the previous positions for following
positions, using blocks of 64 packed ints
- StartOffsets --> (AvgCharsPerTerm^TotalDistinctFields) StartOffsetDelta^TotalOffsets
- TotalOffsets is the sum of frequencies of terms of all fields that have offsets
- AvgCharsPerTerm: average number of chars per term, encoded as a
float32 on 4 bytes. They are not present if no field has both
positions and offsets enabled.
- StartOffsetDelta: (startOffset - previousStartOffset - AvgCharsPerTerm
* PositionDelta). previousStartOffset is 0 for the first offset and
AvgCharsPerTerm is 0 if the field has no positions, using blocks of
64 packed ints (see the sketch at the end of this file)
- Lengths --> LengthMinusTermLength^TotalOffsets
- LengthMinusTermLength: (endOffset - startOffset - termLength) using blocks of 64 packed ints
- PayloadLengths --> PayloadLength^TotalPayloads
- TotalPayloads is the sum of frequencies of terms of all fields that have payloads
- PayloadLength is the payload length encoded using blocks of 64 packed ints
- TermAndPayloads --> LZ4-compressed representation of <FieldTermsAndPayLoads>^TotalFields
- FieldTermsAndPayLoads --> Terms (Payloads)
- Terms: term bytes
- Payloads: payload bytes (if the field has payloads)

2. vector_index

An index file (extension .tvx).

- VectorIndex (.tvx) --> <Header>, <ChunkIndex>
- Header --> CodecHeader
- ChunkIndex: See CompressingStoredFieldsIndexWriter
*/
type Lucene42TermVectorsFormat struct {
	*compressing.CompressingTermVectorsFormat
}

func NewLucene42TermVectorsFormat() *Lucene42TermVectorsFormat {
	return &Lucene42TermVectorsFormat{
		compressing.NewCompressingTermVectorsFormat("Lucene41StoredFields", "", compressing.COMPRESSION_MODE_FAST, 1<<12),
	}
}
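
// The per-field Flag above is a 3-bit int. Below is a minimal sketch of how
// such a flag could be decoded, assuming the first bit marks positions, the
// second offsets and the third payloads, as in the format description; the
// constants and function are illustrative only, not part of this package's
// decoding path.
const (
	sketchPositionsBit = 1 << 0 // first bit: field has positions (assumed value)
	sketchOffsetsBit   = 1 << 1 // second bit: field has offsets (assumed value)
	sketchPayloadsBit  = 1 << 2 // third bit: field has payloads (assumed value)
)

// sketchDecodeFlag splits a 3-bit field flag into its three options.
func sketchDecodeFlag(flag int) (hasPositions, hasOffsets, hasPayloads bool) {
	return flag&sketchPositionsBit != 0,
		flag&sketchOffsetsBit != 0,
		flag&sketchPayloadsBit != 0
}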
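
// Term bytes within a chunk are prefix-compressed: PrefixLength gives the
// number of bytes shared with the previous term and SuffixLength the number
// of remaining bytes. A minimal sketch of reassembling one field's terms,
// assuming the Terms stream stores only the suffix bytes back to back; the
// names and slice layout are illustrative, not this package's API.
func sketchRebuildTerms(prefixLens, suffixLens []int, suffixBytes []byte) [][]byte {
	terms := make([][]byte, 0, len(prefixLens))
	var prev []byte
	pos := 0
	for i := range prefixLens {
		term := make([]byte, 0, prefixLens[i]+suffixLens[i])
		term = append(term, prev[:prefixLens[i]]...)               // shared prefix with the previous term
		term = append(term, suffixBytes[pos:pos+suffixLens[i]]...) // new suffix bytes
		pos += suffixLens[i]
		terms = append(terms, term)
		prev = term
	}
	return terms
}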
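
// Positions, start offsets and lengths are difference-coded as described
// above. A minimal sketch of rebuilding one term's occurrences from decoded
// PositionDelta, StartOffsetDelta and LengthMinusTermLength values, inverting
// the documented formulas; the truncation of AvgCharsPerTerm*PositionDelta
// and all names here are assumptions for illustration, not this package's API.
func sketchRebuildOffsets(posDeltas, startOffDeltas, lenMinusTermLen []int,
	avgCharsPerTerm float32, termLen int) (positions, startOffsets, endOffsets []int) {

	position, startOffset := 0, 0
	for i := range posDeltas {
		// PositionDelta is absolute for the first position, a delta afterwards.
		position += posDeltas[i]
		positions = append(positions, position)

		// startOffset = previousStartOffset + AvgCharsPerTerm*PositionDelta + StartOffsetDelta
		startOffset += int(avgCharsPerTerm*float32(posDeltas[i])) + startOffDeltas[i]
		startOffsets = append(startOffsets, startOffset)

		// endOffset = startOffset + termLength + LengthMinusTermLength
		endOffsets = append(endOffsets, startOffset+termLen+lenMinusTermLen[i])
	}
	return positions, startOffsets, endOffsets
}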