github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/lucene42/docValuesFormat.go (about)

     1  package lucene42
     2  
     3  import (
     4  	. "github.com/balzaczyy/golucene/core/codec/spi"
     5  	. "github.com/balzaczyy/golucene/core/index/model"
     6  	"github.com/balzaczyy/golucene/core/util/packed"
     7  )
     8  
     9  // lucene42/Lucene42DocValuesFormat.java
    10  
    11  /*
    12  Lucene 4.2 DocValues format.
    13  
    14  Encodes the four per-document value types (Numeric, Binary, Sorted,
    15  SortedSet) with eight basic strategies.
    16  
    17  - Delta-compressed Numerics: per-document integers written in blocks
    18    of 4096. For each block the minimum value is encoded, and each entry
    19    is a delta from that minimum value.
    20  - Table-compressed Numerics: when the number of unique values is very
    21    small, a lookup table is written instead. Each per-document entry
    22    is instead the ordinal to this table.
    23  - Uncompressed Numerics: when all values would fit into a single byte,
    24    and the acceptableOverheadRatio would pack values into 8 bits per
    25    value anyway, they are written as absolute values (with no indirection
    26    or packing) for performance.
    27  - GCD-compressed Numerics: when all numbers share a common divisor,
    28    such as dates, the greatest common divisor (GCD) is computed,
    29    and quotients are stored using Delta-compressed Numerics.
    30  - Fixed-width Binary: one large concatenated []byte is written, along
    31    with the fixed length. Each document's value can be addressed by
    32    docID*length.
    33  - Variable-width Binary: one large concatenated []byte is written,
    34    along with end addresses for each document. The addresses are written
    35    in blocks of 4096, with the current absolute start for the block,
    36    and the average (expected) delta per entry. For each document the
    37    deviation from the delta (actual - expected) is written.
    38  - Sorted: an FST mapping deduplicated terms to ordinals is written,
    39    along with the per-document ordinals written using one of the numeric
    40    strategies above.
    41  - SortedSet: an FST mapping deduplicated terms to ordinals is written,
    42    along with the per-document ordinal list written using one of the
    43    binary strategies above.
    44  
    45  Files:
    46  
    47  1. .dvd: DocValues data
    48  2. .dvm: DocValues metadata
    49  
    50  ###### 1. dvm
    51  
    52  The DocValues metadata or .dvm files.
    53  
    54  For each DocValues field, this stores metadata, such as the offset
    55  into the DocValues data (.dvd).
    56  
    57  DocValues metadata (.dvm) --> Header, <FieldNumber, EntryType, Entry>^NumFields
    58  
    59  - Entry --> NumericEntry | BinaryEntry | SortedEntry
    60  - NumericEntry --> DataOffset, CompressionType, PackedVersion
    61  - BinaryEntry --> DataOffset, DataLength, MinLength, MaxLength, PackedVersion?, BlockSize?
    62  - SortedEntry --> DataOffset, ValueCount
    63  - FieldNumber, PackedVersion, MinLength, MaxLength, BlockSize, ValueCount --> VInt
    64  - DataOffset, DataLength --> int64
    65  - EntryType, CompressionType --> byte
    66  - Header --> CodecHeader
    67  
    68  Sorted fields have two entries: a SortedEntry with the FST metadata,
    69  and an ordinary NumericEntry for the document-to-ord metadata.
    70  
    71  SortedSet fields have two entries: a SortedEntry with the FST metadata,
    72  and an ordinary BinaryEntry for the document-to-ord-list metadata.
    73  
    74  FieldNumber of -1 indicates the end of metadata.
    75  
    76  EntryType is a 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry)
    77  
    78  DataOffset is the pointer to the start of the data in the DocValues
    79  data (.dvd)
    80  
    81  CompressionType indicates how Numeric values will be compressed:
    82  
    83  - 0 --> delta-compressed. For each block of 4096 integers, every integer
    84    is delta-encoded from the minimum value within the block.
    85  - 1 --> table-compressed. When the number of unique numeric values is
    86    small and it would save space, a lookup table of unique values is
    87    written, followed by the ordinal for each document.
    88  - 2 --> uncompressed. When the acceptableOverheadRatio parameter would
    89    upgrade the number of bits required to 8, and all values fit in a
    90    byte, these are written as absolute binary values for performance.
    91  - 3 --> gcd-compressed. When all integers share a common divisor, only
    92    quotients are stored using blocks of delta-encoded ints.
    93  
    94  
    95  MinLength and MaxLength represent the min and max []byte value lengths
    96  for Binary values. If they are equal, then all values are of a fixed
    97  size, and can be addressed as DataOffset + (docID * length). Otherwise,
    98  the binary values are of variable size, and packed integer metadata (
    99  PackedVersion, BlockSize) is written for addresses.
   100  
   101  ###### 2. dvd
   102  
   103  The DocValues data or .dvd file.
   104  
   105  For each DocValues field, this stores the actual per-document data
   106  (the heavy-lifting)
   107  
   108  DocValues data (.dvd) --> Header, <NumericData | BinaryData | SortedData>^NumFields
   109  
   110  - NumericData --> DeltaCompressedNumerics | TableCompressedNumerics |
   111    UncompressedNumerics | GCDCompressedNumerics
   112  - BinaryData --> byte^DataLength, Addresses
   113  - SortedData --> FST<int64>
   114  - DeltaCompressedNumerics --> BlockPackedInts(blockSize=4096)
   115  - TableCompressedNumerics --> TableSize, int64^TableSize, PackedInts
   116  - UncompressedNumerics --> byte^maxDoc
   117  - Addresses --> MonotonicBlockPackedInts(blockSize=4096)
   118  
   119  
   120  SortedSet entries store the list of ordinals in their BinaryData as a
   121  sequence of increasing VLongs, delta-encoded.
   122  
   123  Limitations:
   124  - Binary doc values can be at most MAX_BINARY_FIELD_LENGTH in length.
   125  */
   126  type Lucene42DocValuesFormat struct {
   127  	AcceptableOverheadRatio float32 // overhead ratio for packed ints (see format notes above); NewLucene42DocValuesFormat sets it to packed.PackedInts.DEFAULT
   128  }
   129  
   130  func NewLucene42DocValuesFormat() *Lucene42DocValuesFormat {
   131  	return &Lucene42DocValuesFormat{packed.PackedInts.DEFAULT}
   132  }
   133  
   134  func (f *Lucene42DocValuesFormat) Name() string {
   135  	return "Lucene42"
   136  }
   137  
   138  func (f *Lucene42DocValuesFormat) FieldsConsumer(state *SegmentWriteState) (w DocValuesConsumer, err error) {
   139  	panic("this codec can only be used for reading")
   140  }
   141  
   142  func (f *Lucene42DocValuesFormat) FieldsProducer(state SegmentReadState) (r DocValuesProducer, err error) {
   143  	return newLucene42DocValuesProducer(state,
   144  		LUCENE42_DV_DATA_CODEC, LUCENE42_DV_DATA_EXTENSION,
   145  		LUCENE42_DV_METADATA_CODEC, LUCENE42_DV_METADATA_EXTENSION)
   146  }