github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/encoding.go (about) 1 package parquet 2 3 import ( 4 "math/bits" 5 6 "github.com/segmentio/parquet-go/encoding" 7 "github.com/segmentio/parquet-go/encoding/bitpacked" 8 "github.com/segmentio/parquet-go/encoding/bytestreamsplit" 9 "github.com/segmentio/parquet-go/encoding/delta" 10 "github.com/segmentio/parquet-go/encoding/plain" 11 "github.com/segmentio/parquet-go/encoding/rle" 12 "github.com/segmentio/parquet-go/format" 13 ) 14 15 var ( 16 // Plain is the default parquet encoding. 17 Plain plain.Encoding 18 19 // RLE is the hybrid bit-pack/run-length parquet encoding. 20 RLE rle.Encoding 21 22 // BitPacked is the deprecated bit-packed encoding for repetition and 23 // definition levels. 24 BitPacked bitpacked.Encoding 25 26 // PlainDictionary is the plain dictionary parquet encoding. 27 // 28 // This encoding should not be used anymore in parquet 2.0 and later, 29 // it is implemented for backwards compatibility to support reading 30 // files that were encoded with older parquet libraries. 31 PlainDictionary plain.DictionaryEncoding 32 33 // RLEDictionary is the RLE dictionary parquet encoding. 34 RLEDictionary rle.DictionaryEncoding 35 36 // DeltaBinaryPacked is the delta binary packed parquet encoding. 37 DeltaBinaryPacked delta.BinaryPackedEncoding 38 39 // DeltaLengthByteArray is the delta length byte array parquet encoding. 40 DeltaLengthByteArray delta.LengthByteArrayEncoding 41 42 // DeltaByteArray is the delta byte array parquet encoding. 43 DeltaByteArray delta.ByteArrayEncoding 44 45 // ByteStreamSplit is an encoding for floating-point data. 46 ByteStreamSplit bytestreamsplit.Encoding 47 48 // Table indexing the encodings supported by this package. 49 encodings = [...]encoding.Encoding{ 50 format.Plain: &Plain, 51 format.PlainDictionary: &PlainDictionary, 52 format.BitPacked: &BitPacked, 53 format.RLE: &RLE, 54 format.RLEDictionary: &RLEDictionary, 55 format.DeltaBinaryPacked: &DeltaBinaryPacked, 56 format.DeltaLengthByteArray: &DeltaLengthByteArray, 57 format.DeltaByteArray: &DeltaByteArray, 58 format.ByteStreamSplit: &ByteStreamSplit, 59 } 60 61 // Table indexing RLE encodings for repetition and definition levels of 62 // all supported bit widths. 63 levelEncodingsRLE = [...]rle.Encoding{ 64 0: {BitWidth: 1}, 65 1: {BitWidth: 2}, 66 2: {BitWidth: 3}, 67 3: {BitWidth: 4}, 68 4: {BitWidth: 5}, 69 5: {BitWidth: 6}, 70 6: {BitWidth: 7}, 71 7: {BitWidth: 8}, 72 } 73 74 levelEncodingsBitPacked = [...]bitpacked.Encoding{ 75 0: {BitWidth: 1}, 76 1: {BitWidth: 2}, 77 2: {BitWidth: 3}, 78 3: {BitWidth: 4}, 79 4: {BitWidth: 5}, 80 5: {BitWidth: 6}, 81 6: {BitWidth: 7}, 82 7: {BitWidth: 8}, 83 } 84 ) 85 86 func isDictionaryEncoding(encoding encoding.Encoding) bool { 87 return isDictionaryFormat(encoding.Encoding()) 88 } 89 90 func isDictionaryFormat(encoding format.Encoding) bool { 91 return encoding == format.PlainDictionary || encoding == format.RLEDictionary 92 } 93 94 // LookupEncoding returns the parquet encoding associated with the given code. 95 // 96 // The function never returns nil. If the encoding is not supported, 97 // encoding.NotSupported is returned. 98 func LookupEncoding(enc format.Encoding) encoding.Encoding { 99 if enc >= 0 && int(enc) < len(encodings) { 100 if e := encodings[enc]; e != nil { 101 return e 102 } 103 } 104 return encoding.NotSupported{} 105 } 106 107 func lookupLevelEncoding(enc format.Encoding, max byte) encoding.Encoding { 108 i := bits.Len8(max) - 1 109 switch enc { 110 case format.RLE: 111 return &levelEncodingsRLE[i] 112 case format.BitPacked: 113 return &levelEncodingsBitPacked[i] 114 default: 115 return encoding.NotSupported{} 116 } 117 } 118 119 func canEncode(e encoding.Encoding, k Kind) bool { 120 if isDictionaryEncoding(e) { 121 return true 122 } 123 switch k { 124 case Boolean: 125 return encoding.CanEncodeBoolean(e) 126 case Int32: 127 return encoding.CanEncodeInt32(e) 128 case Int64: 129 return encoding.CanEncodeInt64(e) 130 case Int96: 131 return encoding.CanEncodeInt96(e) 132 case Float: 133 return encoding.CanEncodeFloat(e) 134 case Double: 135 return encoding.CanEncodeDouble(e) 136 case ByteArray: 137 return encoding.CanEncodeByteArray(e) 138 case FixedLenByteArray: 139 return encoding.CanEncodeFixedLenByteArray(e) 140 default: 141 return false 142 } 143 }