github.com/fraugster/parquet-go@v0.12.0/chunk_reader.go (about) 1 package goparquet 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "hash/crc32" 8 "io" 9 "math/bits" 10 11 "github.com/fraugster/parquet-go/parquet" 12 ) 13 14 type getValueDecoderFn func(parquet.Encoding) (valuesDecoder, error) 15 type getLevelDecoder func(parquet.Encoding) (levelDecoder, error) 16 17 func getDictValuesDecoder(typ *parquet.SchemaElement) (valuesDecoder, error) { 18 switch *typ.Type { 19 case parquet.Type_BYTE_ARRAY: 20 return &byteArrayPlainDecoder{}, nil 21 case parquet.Type_FIXED_LEN_BYTE_ARRAY: 22 if typ.TypeLength == nil { 23 return nil, fmt.Errorf("type %s with nil type len", typ) 24 } 25 return &byteArrayPlainDecoder{length: int(*typ.TypeLength)}, nil 26 case parquet.Type_FLOAT: 27 return &floatPlainDecoder{}, nil 28 case parquet.Type_DOUBLE: 29 return &doublePlainDecoder{}, nil 30 case parquet.Type_INT32: 31 return &int32PlainDecoder{}, nil 32 case parquet.Type_INT64: 33 return &int64PlainDecoder{}, nil 34 case parquet.Type_INT96: 35 return &int96PlainDecoder{}, nil 36 } 37 38 return nil, fmt.Errorf("type %s is not supported for dict value encoder", typ) 39 } 40 41 func getBooleanValuesDecoder(pageEncoding parquet.Encoding) (valuesDecoder, error) { 42 switch pageEncoding { 43 case parquet.Encoding_PLAIN: 44 return &booleanPlainDecoder{}, nil 45 case parquet.Encoding_RLE: 46 return &booleanRLEDecoder{}, nil 47 default: 48 return nil, fmt.Errorf("unsupported encoding %s for boolean", pageEncoding) 49 } 50 } 51 52 func getByteArrayValuesDecoder(pageEncoding parquet.Encoding, dictValues []interface{}) (valuesDecoder, error) { 53 switch pageEncoding { 54 case parquet.Encoding_PLAIN: 55 return &byteArrayPlainDecoder{}, nil 56 case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY: 57 return &byteArrayDeltaLengthDecoder{}, nil 58 case parquet.Encoding_DELTA_BYTE_ARRAY: 59 return &byteArrayDeltaDecoder{}, nil 60 case parquet.Encoding_RLE_DICTIONARY: 61 return &dictDecoder{uniqueValues: dictValues}, nil 62 default: 63 return nil, fmt.Errorf("unsupported encoding %s for binary", pageEncoding) 64 } 65 } 66 67 func getFixedLenByteArrayValuesDecoder(pageEncoding parquet.Encoding, len int, dictValues []interface{}) (valuesDecoder, error) { 68 switch pageEncoding { 69 case parquet.Encoding_PLAIN: 70 return &byteArrayPlainDecoder{length: len}, nil 71 case parquet.Encoding_DELTA_BYTE_ARRAY: 72 return &byteArrayDeltaDecoder{}, nil 73 case parquet.Encoding_RLE_DICTIONARY: 74 return &dictDecoder{uniqueValues: dictValues}, nil 75 default: 76 return nil, fmt.Errorf("unsupported encoding %s for fixed_len_byte_array(%d)", pageEncoding, len) 77 } 78 } 79 80 func getInt32ValuesDecoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesDecoder, error) { 81 switch pageEncoding { 82 case parquet.Encoding_PLAIN: 83 return &int32PlainDecoder{}, nil 84 case parquet.Encoding_DELTA_BINARY_PACKED: 85 return &int32DeltaBPDecoder{}, nil 86 case parquet.Encoding_RLE_DICTIONARY: 87 return &dictDecoder{uniqueValues: dictValues}, nil 88 default: 89 return nil, fmt.Errorf("unsupported encoding %s for int32", pageEncoding) 90 } 91 } 92 93 func getInt64ValuesDecoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesDecoder, error) { 94 switch pageEncoding { 95 case parquet.Encoding_PLAIN: 96 return &int64PlainDecoder{}, nil 97 case parquet.Encoding_DELTA_BINARY_PACKED: 98 return &int64DeltaBPDecoder{}, nil 99 case parquet.Encoding_RLE_DICTIONARY: 100 return &dictDecoder{uniqueValues: dictValues}, nil 101 default: 102 return nil, fmt.Errorf("unsupported encoding %s for int64", pageEncoding) 103 } 104 } 105 106 func getValuesDecoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesDecoder, error) { 107 // Change the deprecated value 108 if pageEncoding == parquet.Encoding_PLAIN_DICTIONARY { 109 pageEncoding = parquet.Encoding_RLE_DICTIONARY 110 } 111 112 switch *typ.Type { 113 case parquet.Type_BOOLEAN: 114 return getBooleanValuesDecoder(pageEncoding) 115 116 case parquet.Type_BYTE_ARRAY: 117 return getByteArrayValuesDecoder(pageEncoding, dictValues) 118 119 case parquet.Type_FIXED_LEN_BYTE_ARRAY: 120 if typ.TypeLength == nil { 121 return nil, fmt.Errorf("type %s with nil type len", typ.Type) 122 } 123 return getFixedLenByteArrayValuesDecoder(pageEncoding, int(*typ.TypeLength), dictValues) 124 case parquet.Type_FLOAT: 125 switch pageEncoding { 126 case parquet.Encoding_PLAIN: 127 return &floatPlainDecoder{}, nil 128 case parquet.Encoding_RLE_DICTIONARY: 129 return &dictDecoder{uniqueValues: dictValues}, nil 130 } 131 132 case parquet.Type_DOUBLE: 133 switch pageEncoding { 134 case parquet.Encoding_PLAIN: 135 return &doublePlainDecoder{}, nil 136 case parquet.Encoding_RLE_DICTIONARY: 137 return &dictDecoder{uniqueValues: dictValues}, nil 138 } 139 140 case parquet.Type_INT32: 141 return getInt32ValuesDecoder(pageEncoding, typ, dictValues) 142 143 case parquet.Type_INT64: 144 return getInt64ValuesDecoder(pageEncoding, typ, dictValues) 145 146 case parquet.Type_INT96: 147 switch pageEncoding { 148 case parquet.Encoding_PLAIN: 149 return &int96PlainDecoder{}, nil 150 case parquet.Encoding_RLE_DICTIONARY: 151 return &dictDecoder{uniqueValues: dictValues}, nil 152 } 153 154 default: 155 return nil, fmt.Errorf("unsupported type %s", typ.Type) 156 } 157 158 return nil, fmt.Errorf("unsupported encoding %s for %s type", pageEncoding, typ.Type) 159 } 160 161 func readPageBlock(r io.Reader, codec parquet.CompressionCodec, compressedSize int32, uncompressedSize int32, validateCRC bool, crc *int32, alloc *allocTracker) ([]byte, error) { 162 if compressedSize < 0 || uncompressedSize < 0 { 163 return nil, errors.New("invalid page data size") 164 } 165 166 alloc.test(uint64(compressedSize)) 167 dataPageBlock, err := io.ReadAll(io.LimitReader(r, int64(compressedSize))) 168 if err != nil { 169 return nil, fmt.Errorf("read failed: %w", err) 170 } 171 alloc.register(dataPageBlock, uint64(len(dataPageBlock))) 172 173 if validateCRC && crc != nil { 174 if sum := crc32.ChecksumIEEE(dataPageBlock); sum != uint32(*crc) { 175 return nil, fmt.Errorf("CRC32 check failed: expected CRC32 %x, got %x", sum, uint32(*crc)) 176 } 177 } 178 179 return dataPageBlock, nil 180 } 181 182 func (f *FileReader) readPages(ctx context.Context, r *offsetReader, col *Column, chunkMeta *parquet.ColumnMetaData, dDecoder, rDecoder getLevelDecoder) (pages []pageReader, useDict bool, err error) { 183 var ( 184 dictPage *dictPageReader 185 ) 186 187 for { 188 if chunkMeta.TotalCompressedSize-r.Count() <= 0 { 189 break 190 } 191 ph := &parquet.PageHeader{} 192 if err := readThrift(ctx, ph, r); err != nil { 193 return nil, false, err 194 } 195 196 if ph.Type == parquet.PageType_DICTIONARY_PAGE { 197 if dictPage != nil { 198 return nil, false, errors.New("there should be only one dictionary") 199 } 200 p := &dictPageReader{ 201 alloc: f.allocTracker, 202 validateCRC: f.schemaReader.validateCRC, 203 } 204 de, err := getDictValuesDecoder(col.Element()) 205 if err != nil { 206 return nil, false, err 207 } 208 if err := p.init(de); err != nil { 209 return nil, false, err 210 } 211 212 if err := p.read(r, ph, chunkMeta.Codec); err != nil { 213 return nil, false, err 214 } 215 216 dictPage = p 217 218 // Go to the next data Page 219 // if we have a DictionaryPageOffset we should return to DataPageOffset 220 if chunkMeta.DictionaryPageOffset != nil { 221 if *chunkMeta.DictionaryPageOffset != r.offset { 222 if _, err := r.Seek(chunkMeta.DataPageOffset, io.SeekStart); err != nil { 223 return nil, false, err 224 } 225 } 226 } 227 continue // go to next page 228 } 229 230 var p pageReader 231 switch ph.Type { 232 case parquet.PageType_DATA_PAGE: 233 p = &dataPageReaderV1{ 234 alloc: f.allocTracker, 235 ph: ph, 236 } 237 case parquet.PageType_DATA_PAGE_V2: 238 p = &dataPageReaderV2{ 239 alloc: f.allocTracker, 240 ph: ph, 241 } 242 default: 243 return nil, false, fmt.Errorf("DATA_PAGE or DATA_PAGE_V2 type supported, but was %s", ph.Type) 244 } 245 var dictValue []interface{} 246 if dictPage != nil { 247 dictValue = dictPage.values 248 } 249 var fn = func(typ parquet.Encoding) (valuesDecoder, error) { 250 return getValuesDecoder(typ, col.Element(), clone(dictValue)) 251 } 252 if err := p.init(dDecoder, rDecoder, fn); err != nil { 253 return nil, false, err 254 } 255 256 if err := p.read(r, ph, chunkMeta.Codec, f.schemaReader.validateCRC); err != nil { 257 return nil, false, err 258 } 259 pages = append(pages, p) 260 } 261 262 return pages, dictPage != nil, nil 263 } 264 265 func clone(in []interface{}) []interface{} { 266 out := make([]interface{}, len(in)) 267 copy(out, in) 268 return out 269 } 270 271 func (f *FileReader) skipChunk(col *Column, chunk *parquet.ColumnChunk) error { 272 if chunk.FilePath != nil { 273 return fmt.Errorf("nyi: data is in another file: '%s'", *chunk.FilePath) 274 } 275 276 c := col.Index() 277 // chunk.FileOffset is useless so ChunkMetaData is required here 278 // as we cannot read it from r 279 // see https://issues.apache.org/jira/browse/PARQUET-291 280 if chunk.MetaData == nil { 281 return fmt.Errorf("missing meta data for Column %c", c) 282 } 283 284 if typ := *col.Element().Type; chunk.MetaData.Type != typ { 285 return fmt.Errorf("wrong type in Column chunk metadata, expected %s was %s", 286 typ, chunk.MetaData.Type) 287 } 288 289 offset := chunk.MetaData.DataPageOffset 290 if chunk.MetaData.DictionaryPageOffset != nil { 291 offset = *chunk.MetaData.DictionaryPageOffset 292 } 293 294 offset += chunk.MetaData.TotalCompressedSize 295 _, err := f.reader.Seek(offset, io.SeekStart) 296 return err 297 } 298 299 func (f *FileReader) readChunk(ctx context.Context, col *Column, chunk *parquet.ColumnChunk) (pages []pageReader, useDict bool, err error) { 300 if chunk.FilePath != nil { 301 return nil, false, fmt.Errorf("nyi: data is in another file: '%s'", *chunk.FilePath) 302 } 303 304 c := col.Index() 305 // chunk.FileOffset is useless so ChunkMetaData is required here 306 // as we cannot read it from r 307 // see https://issues.apache.org/jira/browse/PARQUET-291 308 if chunk.MetaData == nil { 309 return nil, false, fmt.Errorf("missing meta data for Column %c", c) 310 } 311 312 if typ := *col.Element().Type; chunk.MetaData.Type != typ { 313 return nil, false, fmt.Errorf("wrong type in Column chunk metadata, expected %s was %s", 314 typ, chunk.MetaData.Type) 315 } 316 317 offset := chunk.MetaData.DataPageOffset 318 if chunk.MetaData.DictionaryPageOffset != nil { 319 offset = *chunk.MetaData.DictionaryPageOffset 320 } 321 // Seek to the beginning of the first Page 322 if _, err := f.reader.Seek(offset, io.SeekStart); err != nil { 323 return nil, false, err 324 } 325 326 reader := &offsetReader{ 327 inner: f.reader, 328 offset: offset, 329 count: 0, 330 } 331 332 rDecoder := func(enc parquet.Encoding) (levelDecoder, error) { 333 if enc != parquet.Encoding_RLE { 334 return nil, fmt.Errorf("%q is not supported for definition and repetition level", enc) 335 } 336 dec := newHybridDecoder(bits.Len16(col.MaxRepetitionLevel())) 337 dec.buffered = true 338 return &levelDecoderWrapper{decoder: dec, max: col.MaxRepetitionLevel()}, nil 339 } 340 341 dDecoder := func(enc parquet.Encoding) (levelDecoder, error) { 342 if enc != parquet.Encoding_RLE { 343 return nil, fmt.Errorf("%q is not supported for definition and repetition level", enc) 344 } 345 dec := newHybridDecoder(bits.Len16(col.MaxDefinitionLevel())) 346 dec.buffered = true 347 return &levelDecoderWrapper{decoder: dec, max: col.MaxDefinitionLevel()}, nil 348 } 349 350 if col.MaxRepetitionLevel() == 0 { 351 rDecoder = func(parquet.Encoding) (levelDecoder, error) { 352 return &levelDecoderWrapper{decoder: constDecoder(0), max: col.MaxRepetitionLevel()}, nil 353 } 354 } 355 356 if col.MaxDefinitionLevel() == 0 { 357 dDecoder = func(parquet.Encoding) (levelDecoder, error) { 358 return &levelDecoderWrapper{decoder: constDecoder(0), max: col.MaxDefinitionLevel()}, nil 359 } 360 } 361 return f.readPages(ctx, reader, col, chunk.MetaData, dDecoder, rDecoder) 362 } 363 364 func readPageData(col *Column, pages []pageReader, useDict bool) error { 365 s := col.getColumnStore() 366 s.pageIdx, s.pages = 0, pages 367 s.useDict = useDict 368 if err := s.readNextPage(); err != nil { 369 return nil 370 } 371 372 return nil 373 } 374 375 func (f *FileReader) readRowGroupData(ctx context.Context) error { 376 rowGroup := f.meta.RowGroups[f.rowGroupPosition-1] 377 dataCols := f.schemaReader.Columns() 378 379 f.schemaReader.resetData() 380 f.schemaReader.setNumRecords(rowGroup.NumRows) 381 for _, c := range dataCols { 382 idx := c.Index() 383 if len(rowGroup.Columns) <= idx { 384 return fmt.Errorf("column index %d is out of bounds", idx) 385 } 386 chunk := rowGroup.Columns[c.Index()] 387 if !f.schemaReader.isSelectedByPath(c.path) { 388 if err := f.skipChunk(c, chunk); err != nil { 389 return err 390 } 391 c.data.skipped = true 392 continue 393 } 394 pages, useDict, err := f.readChunk(ctx, c, chunk) 395 if err != nil { 396 return err 397 } 398 if err := readPageData(c, pages, useDict); err != nil { 399 return err 400 } 401 } 402 403 return nil 404 }