github.com/fraugster/parquet-go@v0.12.0/page_v1.go (about) 1 package goparquet 2 3 import ( 4 "bytes" 5 "context" 6 "errors" 7 "fmt" 8 "hash/crc32" 9 "io" 10 "math/bits" 11 12 "github.com/fraugster/parquet-go/parquet" 13 ) 14 15 type dataPageReaderV1 struct { 16 ph *parquet.PageHeader 17 18 valuesCount int32 19 encoding parquet.Encoding 20 dDecoder, rDecoder levelDecoder 21 valuesDecoder valuesDecoder 22 fn getValueDecoderFn 23 24 position int 25 26 alloc *allocTracker 27 } 28 29 func (dp *dataPageReaderV1) numValues() int32 { 30 return dp.valuesCount 31 } 32 33 func (dp *dataPageReaderV1) readValues(size int) (values []interface{}, dLevel *packedArray, rLevel *packedArray, err error) { 34 if rem := int(dp.valuesCount) - dp.position; rem < size { 35 size = rem 36 } 37 38 if size == 0 { 39 return nil, nil, nil, nil 40 } 41 42 rLevel, _, err = decodePackedArray(dp.rDecoder, size) 43 if err != nil { 44 return nil, nil, nil, fmt.Errorf("read repetition levels failed: %w", err) 45 } 46 47 var notNull int 48 dLevel, notNull, err = decodePackedArray(dp.dDecoder, size) 49 if err != nil { 50 return nil, nil, nil, fmt.Errorf("read definition levels failed: %w", err) 51 } 52 53 val := make([]interface{}, notNull) 54 55 if notNull != 0 { 56 if n, err := dp.valuesDecoder.decodeValues(val); err != nil { 57 return nil, nil, nil, fmt.Errorf("read values from page failed, need %d value read %d: %w", notNull, n, err) 58 } 59 } 60 dp.position += size 61 62 return val, dLevel, rLevel, nil 63 } 64 65 func (dp *dataPageReaderV1) init(dDecoder, rDecoder getLevelDecoder, values getValueDecoderFn) error { 66 if dp.ph.DataPageHeader == nil { 67 return errors.New("page header is missing data page header") 68 } 69 70 var err error 71 dp.rDecoder, err = rDecoder(dp.ph.DataPageHeader.RepetitionLevelEncoding) 72 if err != nil { 73 return err 74 } 75 76 dp.dDecoder, err = dDecoder(dp.ph.DataPageHeader.DefinitionLevelEncoding) 77 if err != nil { 78 return err 79 } 80 81 dp.fn = values 82 dp.position = 0 83 84 return nil 85 } 86 87 func (dp *dataPageReaderV1) read(r io.Reader, ph *parquet.PageHeader, codec parquet.CompressionCodec, validateCRC bool) (err error) { 88 if ph.DataPageHeader == nil { 89 return fmt.Errorf("null DataPageHeader in %+v", ph) 90 } 91 92 if dp.valuesCount = ph.DataPageHeader.NumValues; dp.valuesCount < 0 { 93 return fmt.Errorf("negative NumValues in DATA_PAGE: %d", dp.valuesCount) 94 } 95 96 dataPageBlock, err := readPageBlock(r, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), validateCRC, ph.Crc, dp.alloc) 97 if err != nil { 98 return err 99 } 100 101 reader, err := newBlockReader(dataPageBlock, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), dp.alloc) 102 if err != nil { 103 return err 104 } 105 106 dp.encoding = ph.DataPageHeader.Encoding 107 dp.ph = ph 108 109 if dp.valuesDecoder, err = dp.fn(dp.encoding); err != nil { 110 return err 111 } 112 113 if err := dp.rDecoder.initSize(reader); err != nil { 114 return err 115 } 116 117 if err := dp.dDecoder.initSize(reader); err != nil { 118 return err 119 } 120 121 return dp.valuesDecoder.init(reader) 122 } 123 124 type dataPageWriterV1 struct { 125 dictValues []interface{} 126 col *Column 127 codec parquet.CompressionCodec 128 page *dataPage 129 130 dictionary bool 131 enableCRC bool 132 } 133 134 func (dp *dataPageWriterV1) init(col *Column, codec parquet.CompressionCodec) error { 135 dp.col = col 136 dp.codec = codec 137 return nil 138 } 139 140 func (dp *dataPageWriterV1) getHeader(comp, unComp int, pageStats *parquet.Statistics, crc32Checksum *int32) *parquet.PageHeader { 141 enc := dp.col.data.encoding() 142 if dp.dictionary { 143 enc = parquet.Encoding_RLE_DICTIONARY 144 } 145 ph := &parquet.PageHeader{ 146 Type: parquet.PageType_DATA_PAGE, 147 UncompressedPageSize: int32(unComp), 148 CompressedPageSize: int32(comp), 149 Crc: crc32Checksum, 150 DataPageHeader: &parquet.DataPageHeader{ 151 NumValues: int32(dp.page.numValues) + int32(dp.page.nullValues), 152 Encoding: enc, 153 // Only RLE supported for now, not sure if we need support for more encoding 154 DefinitionLevelEncoding: parquet.Encoding_RLE, 155 RepetitionLevelEncoding: parquet.Encoding_RLE, 156 Statistics: pageStats, 157 }, 158 } 159 return ph 160 } 161 162 func (dp *dataPageWriterV1) write(ctx context.Context, w io.Writer) (int, int, error) { 163 dataBuf := &bytes.Buffer{} 164 // Only write repetition value higher than zero 165 if dp.col.MaxRepetitionLevel() > 0 { 166 if err := encodeLevelsV1(dataBuf, dp.col.MaxRepetitionLevel(), dp.page.rL); err != nil { 167 return 0, 0, err 168 } 169 } 170 171 // Only write definition value higher than zero 172 if dp.col.MaxDefinitionLevel() > 0 { 173 if err := encodeLevelsV1(dataBuf, dp.col.MaxDefinitionLevel(), dp.page.dL); err != nil { 174 return 0, 0, err 175 } 176 } 177 178 enc := dp.col.data.encoding() 179 180 if dp.dictionary { 181 enc = parquet.Encoding_RLE_DICTIONARY 182 } 183 184 if dp.dictionary { 185 encoder := newDictEncoder(dataBuf, bits.Len(uint(len(dp.dictValues)))) 186 if err := encoder.encodeIndices(dp.page.indexList); err != nil { 187 return 0, 0, err 188 } 189 if err := encoder.Close(); err != nil { 190 return 0, 0, err 191 } 192 } else { 193 encoder, err := getValuesEncoder(enc, dp.col.Element(), dp.dictValues) 194 if err != nil { 195 return 0, 0, err 196 } 197 198 err = encodeValue(dataBuf, encoder, dp.page.values) 199 if err != nil { 200 return 0, 0, err 201 } 202 } 203 204 comp, err := compressBlock(dataBuf.Bytes(), dp.codec) 205 if err != nil { 206 return 0, 0, fmt.Errorf("compressing data failed with %s method: %w", dp.codec, err) 207 } 208 compSize, unCompSize := len(comp), len(dataBuf.Bytes()) 209 210 var crc32Checksum *int32 211 if dp.enableCRC { 212 v := int32(crc32.ChecksumIEEE(comp)) 213 crc32Checksum = &v 214 } 215 216 header := dp.getHeader(compSize, unCompSize, dp.page.stats, crc32Checksum) 217 if err := writeThrift(ctx, header, w); err != nil { 218 return 0, 0, err 219 } 220 221 return compSize, unCompSize, writeFull(w, comp) 222 } 223 224 func newDataPageV1Writer(useDict bool, dictValues []interface{}, page *dataPage, enableCRC bool) pageWriter { 225 return &dataPageWriterV1{ 226 dictionary: useDict, 227 dictValues: dictValues, 228 page: page, 229 enableCRC: enableCRC, 230 } 231 }