github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/encoding.go (about)

     1  package parquet
     2  
     3  import (
     4  	"math/bits"
     5  
     6  	"github.com/segmentio/parquet-go/encoding"
     7  	"github.com/segmentio/parquet-go/encoding/bitpacked"
     8  	"github.com/segmentio/parquet-go/encoding/bytestreamsplit"
     9  	"github.com/segmentio/parquet-go/encoding/delta"
    10  	"github.com/segmentio/parquet-go/encoding/plain"
    11  	"github.com/segmentio/parquet-go/encoding/rle"
    12  	"github.com/segmentio/parquet-go/format"
    13  )
    14  
    15  var (
    16  	// Plain is the default parquet encoding.
    17  	Plain plain.Encoding
    18  
    19  	// RLE is the hybrid bit-pack/run-length parquet encoding.
    20  	RLE rle.Encoding
    21  
    22  	// BitPacked is the deprecated bit-packed encoding for repetition and
    23  	// definition levels.
    24  	BitPacked bitpacked.Encoding
    25  
    26  	// PlainDictionary is the plain dictionary parquet encoding.
    27  	//
    28  	// This encoding should not be used anymore in parquet 2.0 and later,
    29  	// it is implemented for backwards compatibility to support reading
    30  	// files that were encoded with older parquet libraries.
    31  	PlainDictionary plain.DictionaryEncoding
    32  
    33  	// RLEDictionary is the RLE dictionary parquet encoding.
    34  	RLEDictionary rle.DictionaryEncoding
    35  
    36  	// DeltaBinaryPacked is the delta binary packed parquet encoding.
    37  	DeltaBinaryPacked delta.BinaryPackedEncoding
    38  
    39  	// DeltaLengthByteArray is the delta length byte array parquet encoding.
    40  	DeltaLengthByteArray delta.LengthByteArrayEncoding
    41  
    42  	// DeltaByteArray is the delta byte array parquet encoding.
    43  	DeltaByteArray delta.ByteArrayEncoding
    44  
    45  	// ByteStreamSplit is an encoding for floating-point data.
    46  	ByteStreamSplit bytestreamsplit.Encoding
    47  
    48  	// Table indexing the encodings supported by this package.
    49  	encodings = [...]encoding.Encoding{
    50  		format.Plain:                &Plain,
    51  		format.PlainDictionary:      &PlainDictionary,
    52  		format.BitPacked:            &BitPacked,
    53  		format.RLE:                  &RLE,
    54  		format.RLEDictionary:        &RLEDictionary,
    55  		format.DeltaBinaryPacked:    &DeltaBinaryPacked,
    56  		format.DeltaLengthByteArray: &DeltaLengthByteArray,
    57  		format.DeltaByteArray:       &DeltaByteArray,
    58  		format.ByteStreamSplit:      &ByteStreamSplit,
    59  	}
    60  
    61  	// Table indexing RLE encodings for repetition and definition levels of
    62  	// all supported bit widths.
    63  	levelEncodingsRLE = [...]rle.Encoding{
    64  		0: {BitWidth: 1},
    65  		1: {BitWidth: 2},
    66  		2: {BitWidth: 3},
    67  		3: {BitWidth: 4},
    68  		4: {BitWidth: 5},
    69  		5: {BitWidth: 6},
    70  		6: {BitWidth: 7},
    71  		7: {BitWidth: 8},
    72  	}
    73  
    74  	levelEncodingsBitPacked = [...]bitpacked.Encoding{
    75  		0: {BitWidth: 1},
    76  		1: {BitWidth: 2},
    77  		2: {BitWidth: 3},
    78  		3: {BitWidth: 4},
    79  		4: {BitWidth: 5},
    80  		5: {BitWidth: 6},
    81  		6: {BitWidth: 7},
    82  		7: {BitWidth: 8},
    83  	}
    84  )
    85  
    86  func isDictionaryEncoding(encoding encoding.Encoding) bool {
    87  	return isDictionaryFormat(encoding.Encoding())
    88  }
    89  
    90  func isDictionaryFormat(encoding format.Encoding) bool {
    91  	return encoding == format.PlainDictionary || encoding == format.RLEDictionary
    92  }
    93  
    94  // LookupEncoding returns the parquet encoding associated with the given code.
    95  //
    96  // The function never returns nil. If the encoding is not supported,
    97  // encoding.NotSupported is returned.
    98  func LookupEncoding(enc format.Encoding) encoding.Encoding {
    99  	if enc >= 0 && int(enc) < len(encodings) {
   100  		if e := encodings[enc]; e != nil {
   101  			return e
   102  		}
   103  	}
   104  	return encoding.NotSupported{}
   105  }
   106  
   107  func lookupLevelEncoding(enc format.Encoding, max byte) encoding.Encoding {
   108  	i := bits.Len8(max) - 1
   109  	switch enc {
   110  	case format.RLE:
   111  		return &levelEncodingsRLE[i]
   112  	case format.BitPacked:
   113  		return &levelEncodingsBitPacked[i]
   114  	default:
   115  		return encoding.NotSupported{}
   116  	}
   117  }
   118  
   119  func canEncode(e encoding.Encoding, k Kind) bool {
   120  	if isDictionaryEncoding(e) {
   121  		return true
   122  	}
   123  	switch k {
   124  	case Boolean:
   125  		return encoding.CanEncodeBoolean(e)
   126  	case Int32:
   127  		return encoding.CanEncodeInt32(e)
   128  	case Int64:
   129  		return encoding.CanEncodeInt64(e)
   130  	case Int96:
   131  		return encoding.CanEncodeInt96(e)
   132  	case Float:
   133  		return encoding.CanEncodeFloat(e)
   134  	case Double:
   135  		return encoding.CanEncodeDouble(e)
   136  	case ByteArray:
   137  		return encoding.CanEncodeByteArray(e)
   138  	case FixedLenByteArray:
   139  		return encoding.CanEncodeFixedLenByteArray(e)
   140  	default:
   141  		return false
   142  	}
   143  }