github.com/fraugster/parquet-go@v0.12.0/page_v2.go (about) 1 package goparquet 2 3 import ( 4 "bytes" 5 "context" 6 "fmt" 7 "hash/crc32" 8 "io" 9 "math/bits" 10 11 "github.com/fraugster/parquet-go/parquet" 12 ) 13 14 type dataPageReaderV2 struct { 15 ph *parquet.PageHeader 16 17 valuesCount int32 18 encoding parquet.Encoding 19 valuesDecoder valuesDecoder 20 dDecoder, rDecoder levelDecoder 21 fn getValueDecoderFn 22 position int 23 24 alloc *allocTracker 25 } 26 27 func (dp *dataPageReaderV2) numValues() int32 { 28 return dp.valuesCount 29 } 30 31 func (dp *dataPageReaderV2) readValues(size int) (values []interface{}, dLevel *packedArray, rLevel *packedArray, err error) { 32 if rem := int(dp.valuesCount) - dp.position; rem < size { 33 size = rem 34 } 35 36 if size == 0 { 37 return nil, nil, nil, nil 38 } 39 40 rLevel, _, err = decodePackedArray(dp.rDecoder, size) 41 if err != nil { 42 return nil, nil, nil, fmt.Errorf("read repetition levels failed: %w", err) 43 } 44 45 var notNull int 46 dLevel, notNull, err = decodePackedArray(dp.dDecoder, size) 47 if err != nil { 48 return nil, nil, nil, fmt.Errorf("read definition levels failed: %w", err) 49 } 50 51 val := make([]interface{}, notNull) 52 53 if notNull != 0 { 54 if n, err := dp.valuesDecoder.decodeValues(val); err != nil { 55 return nil, nil, nil, fmt.Errorf("read values from page failed, need %d values but read %d: %w", notNull, n, err) 56 } 57 } 58 dp.position += size 59 return val, dLevel, rLevel, nil 60 } 61 62 func (dp *dataPageReaderV2) init(dDecoder, rDecoder getLevelDecoder, values getValueDecoderFn) error { 63 var err error 64 // Page v2 dose not have any encoding for the levels 65 dp.dDecoder, err = dDecoder(parquet.Encoding_RLE) 66 if err != nil { 67 return err 68 } 69 dp.rDecoder, err = rDecoder(parquet.Encoding_RLE) 70 if err != nil { 71 return err 72 } 73 dp.fn = values 74 dp.position = 0 75 76 return nil 77 } 78 79 func (dp *dataPageReaderV2) read(r io.Reader, ph *parquet.PageHeader, codec parquet.CompressionCodec, validateCRC bool) error { 80 // 1- Uncompressed size is affected by the level lens. 81 // 2- In page V2 the rle size is in header, not in level stream 82 if ph.DataPageHeaderV2 == nil { 83 return fmt.Errorf("null DataPageHeaderV2 in %+v", ph) 84 } 85 86 if dp.valuesCount = ph.DataPageHeaderV2.NumValues; dp.valuesCount < 0 { 87 return fmt.Errorf("negative NumValues in DATA_PAGE_V2: %d", dp.valuesCount) 88 } 89 90 if ph.DataPageHeaderV2.RepetitionLevelsByteLength < 0 { 91 return fmt.Errorf("invalid RepetitionLevelsByteLength %d", ph.DataPageHeaderV2.RepetitionLevelsByteLength) 92 } 93 if ph.DataPageHeaderV2.DefinitionLevelsByteLength < 0 { 94 return fmt.Errorf("invalid DefinitionLevelsByteLength %d", ph.DataPageHeaderV2.DefinitionLevelsByteLength) 95 } 96 dp.encoding = ph.DataPageHeaderV2.Encoding 97 dp.ph = ph 98 99 { // to hide the govet shadow error 100 var err error 101 if dp.valuesDecoder, err = dp.fn(dp.encoding); err != nil { 102 return err 103 } 104 } 105 106 dataPageBlock, err := readPageBlock(r, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), validateCRC, ph.Crc, dp.alloc) 107 if err != nil { 108 return err 109 } 110 111 levelsSize := ph.DataPageHeaderV2.RepetitionLevelsByteLength + ph.DataPageHeaderV2.DefinitionLevelsByteLength 112 113 if ph.DataPageHeaderV2.RepetitionLevelsByteLength > 0 { 114 if err = dp.rDecoder.init(bytes.NewReader(dataPageBlock[:int(ph.DataPageHeaderV2.RepetitionLevelsByteLength)])); err != nil { 115 return fmt.Errorf("read repetition level failed: %w", err) 116 } 117 } 118 119 if ph.DataPageHeaderV2.DefinitionLevelsByteLength > 0 { 120 if err = dp.dDecoder.init(bytes.NewReader(dataPageBlock[int(ph.DataPageHeaderV2.RepetitionLevelsByteLength):levelsSize])); err != nil { 121 return fmt.Errorf("read definition level failed: %w", err) 122 } 123 } 124 125 reader, err := newBlockReader(dataPageBlock[levelsSize:], codec, ph.GetCompressedPageSize()-levelsSize, ph.GetUncompressedPageSize()-levelsSize, dp.alloc) 126 if err != nil { 127 return err 128 } 129 130 return dp.valuesDecoder.init(reader) 131 } 132 133 type dataPageWriterV2 struct { 134 dictValues []interface{} 135 col *Column 136 codec parquet.CompressionCodec 137 page *dataPage 138 139 dictionary bool 140 enableCRC bool 141 } 142 143 func (dp *dataPageWriterV2) init(col *Column, codec parquet.CompressionCodec) error { 144 dp.col = col 145 dp.codec = codec 146 return nil 147 } 148 149 func (dp *dataPageWriterV2) getHeader(comp, unComp, defSize, repSize int, isCompressed bool, pageStats *parquet.Statistics, numRows int32, crc32Checksum *int32) *parquet.PageHeader { 150 enc := dp.col.data.encoding() 151 if dp.dictionary { 152 enc = parquet.Encoding_RLE_DICTIONARY 153 } 154 ph := &parquet.PageHeader{ 155 Type: parquet.PageType_DATA_PAGE_V2, 156 UncompressedPageSize: int32(unComp + defSize + repSize), 157 CompressedPageSize: int32(comp + defSize + repSize), 158 Crc: crc32Checksum, 159 DataPageHeaderV2: &parquet.DataPageHeaderV2{ 160 NumValues: int32(dp.page.numValues) + int32(dp.page.nullValues), 161 NumNulls: int32(dp.page.nullValues), 162 NumRows: numRows, 163 Encoding: enc, 164 DefinitionLevelsByteLength: int32(defSize), 165 RepetitionLevelsByteLength: int32(repSize), 166 IsCompressed: isCompressed, 167 Statistics: pageStats, 168 }, 169 } 170 return ph 171 } 172 173 func (dp *dataPageWriterV2) write(ctx context.Context, w io.Writer) (int, int, error) { 174 rep := &bytes.Buffer{} 175 176 // Only write repetition value higher than zero 177 if dp.col.MaxRepetitionLevel() > 0 { 178 if err := encodeLevelsV2(rep, dp.col.MaxRepetitionLevel(), dp.page.rL); err != nil { 179 return 0, 0, err 180 } 181 } 182 183 def := &bytes.Buffer{} 184 185 // Only write definition level higher than zero 186 if dp.col.MaxDefinitionLevel() > 0 { 187 if err := encodeLevelsV2(def, dp.col.MaxDefinitionLevel(), dp.page.dL); err != nil { 188 return 0, 0, err 189 } 190 } 191 192 dataBuf := &bytes.Buffer{} 193 enc := dp.col.data.encoding() 194 195 if dp.dictionary { 196 enc = parquet.Encoding_RLE_DICTIONARY 197 } 198 199 if dp.dictionary { 200 encoder := newDictEncoder(dataBuf, bits.Len(uint(len(dp.dictValues)))) 201 if err := encoder.encodeIndices(dp.page.indexList); err != nil { 202 return 0, 0, err 203 } 204 if err := encoder.Close(); err != nil { 205 return 0, 0, err 206 } 207 } else { 208 encoder, err := getValuesEncoder(enc, dp.col.Element(), dp.dictValues) 209 if err != nil { 210 return 0, 0, err 211 } 212 213 err = encodeValue(dataBuf, encoder, dp.page.values) 214 if err != nil { 215 return 0, 0, err 216 } 217 } 218 219 comp, err := compressBlock(dataBuf.Bytes(), dp.codec) 220 if err != nil { 221 return 0, 0, fmt.Errorf("compressing data failed with %s method: %w", dp.codec, err) 222 } 223 224 var crc32Checksum *int32 225 if dp.enableCRC { 226 v := int32(crc32.ChecksumIEEE(append(append(rep.Bytes(), def.Bytes()...), comp...))) 227 crc32Checksum = &v 228 } 229 230 compSize, unCompSize := len(comp), len(dataBuf.Bytes()) 231 defLen, repLen := def.Len(), rep.Len() 232 header := dp.getHeader(compSize, unCompSize, defLen, repLen, dp.codec != parquet.CompressionCodec_UNCOMPRESSED, dp.page.stats, int32(dp.page.numRows), crc32Checksum) 233 if err := writeThrift(ctx, header, w); err != nil { 234 return 0, 0, err 235 } 236 237 if err := writeFull(w, rep.Bytes()); err != nil { 238 return 0, 0, err 239 } 240 241 if err := writeFull(w, def.Bytes()); err != nil { 242 return 0, 0, err 243 } 244 245 return compSize + defLen + repLen, unCompSize + defLen + repLen, writeFull(w, comp) 246 } 247 248 func newDataPageV2Writer(useDict bool, dictValues []interface{}, page *dataPage, enableCRC bool) pageWriter { 249 return &dataPageWriterV2{ 250 dictionary: useDict, 251 dictValues: dictValues, 252 page: page, 253 enableCRC: enableCRC, 254 } 255 }