github.com/fraugster/parquet-go@v0.12.0/chunk_reader.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"hash/crc32"
     8  	"io"
     9  	"math/bits"
    10  
    11  	"github.com/fraugster/parquet-go/parquet"
    12  )
    13  
// getValueDecoderFn constructs a values decoder for the given page encoding.
type getValueDecoderFn func(parquet.Encoding) (valuesDecoder, error)

// getLevelDecoder constructs a level decoder (used for both definition and
// repetition levels) for the given page encoding.
type getLevelDecoder func(parquet.Encoding) (levelDecoder, error)
    16  
    17  func getDictValuesDecoder(typ *parquet.SchemaElement) (valuesDecoder, error) {
    18  	switch *typ.Type {
    19  	case parquet.Type_BYTE_ARRAY:
    20  		return &byteArrayPlainDecoder{}, nil
    21  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
    22  		if typ.TypeLength == nil {
    23  			return nil, fmt.Errorf("type %s with nil type len", typ)
    24  		}
    25  		return &byteArrayPlainDecoder{length: int(*typ.TypeLength)}, nil
    26  	case parquet.Type_FLOAT:
    27  		return &floatPlainDecoder{}, nil
    28  	case parquet.Type_DOUBLE:
    29  		return &doublePlainDecoder{}, nil
    30  	case parquet.Type_INT32:
    31  		return &int32PlainDecoder{}, nil
    32  	case parquet.Type_INT64:
    33  		return &int64PlainDecoder{}, nil
    34  	case parquet.Type_INT96:
    35  		return &int96PlainDecoder{}, nil
    36  	}
    37  
    38  	return nil, fmt.Errorf("type %s is not supported for dict value encoder", typ)
    39  }
    40  
    41  func getBooleanValuesDecoder(pageEncoding parquet.Encoding) (valuesDecoder, error) {
    42  	switch pageEncoding {
    43  	case parquet.Encoding_PLAIN:
    44  		return &booleanPlainDecoder{}, nil
    45  	case parquet.Encoding_RLE:
    46  		return &booleanRLEDecoder{}, nil
    47  	default:
    48  		return nil, fmt.Errorf("unsupported encoding %s for boolean", pageEncoding)
    49  	}
    50  }
    51  
    52  func getByteArrayValuesDecoder(pageEncoding parquet.Encoding, dictValues []interface{}) (valuesDecoder, error) {
    53  	switch pageEncoding {
    54  	case parquet.Encoding_PLAIN:
    55  		return &byteArrayPlainDecoder{}, nil
    56  	case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
    57  		return &byteArrayDeltaLengthDecoder{}, nil
    58  	case parquet.Encoding_DELTA_BYTE_ARRAY:
    59  		return &byteArrayDeltaDecoder{}, nil
    60  	case parquet.Encoding_RLE_DICTIONARY:
    61  		return &dictDecoder{uniqueValues: dictValues}, nil
    62  	default:
    63  		return nil, fmt.Errorf("unsupported encoding %s for binary", pageEncoding)
    64  	}
    65  }
    66  
    67  func getFixedLenByteArrayValuesDecoder(pageEncoding parquet.Encoding, len int, dictValues []interface{}) (valuesDecoder, error) {
    68  	switch pageEncoding {
    69  	case parquet.Encoding_PLAIN:
    70  		return &byteArrayPlainDecoder{length: len}, nil
    71  	case parquet.Encoding_DELTA_BYTE_ARRAY:
    72  		return &byteArrayDeltaDecoder{}, nil
    73  	case parquet.Encoding_RLE_DICTIONARY:
    74  		return &dictDecoder{uniqueValues: dictValues}, nil
    75  	default:
    76  		return nil, fmt.Errorf("unsupported encoding %s for fixed_len_byte_array(%d)", pageEncoding, len)
    77  	}
    78  }
    79  
    80  func getInt32ValuesDecoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesDecoder, error) {
    81  	switch pageEncoding {
    82  	case parquet.Encoding_PLAIN:
    83  		return &int32PlainDecoder{}, nil
    84  	case parquet.Encoding_DELTA_BINARY_PACKED:
    85  		return &int32DeltaBPDecoder{}, nil
    86  	case parquet.Encoding_RLE_DICTIONARY:
    87  		return &dictDecoder{uniqueValues: dictValues}, nil
    88  	default:
    89  		return nil, fmt.Errorf("unsupported encoding %s for int32", pageEncoding)
    90  	}
    91  }
    92  
    93  func getInt64ValuesDecoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesDecoder, error) {
    94  	switch pageEncoding {
    95  	case parquet.Encoding_PLAIN:
    96  		return &int64PlainDecoder{}, nil
    97  	case parquet.Encoding_DELTA_BINARY_PACKED:
    98  		return &int64DeltaBPDecoder{}, nil
    99  	case parquet.Encoding_RLE_DICTIONARY:
   100  		return &dictDecoder{uniqueValues: dictValues}, nil
   101  	default:
   102  		return nil, fmt.Errorf("unsupported encoding %s for int64", pageEncoding)
   103  	}
   104  }
   105  
   106  func getValuesDecoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesDecoder, error) {
   107  	// Change the deprecated value
   108  	if pageEncoding == parquet.Encoding_PLAIN_DICTIONARY {
   109  		pageEncoding = parquet.Encoding_RLE_DICTIONARY
   110  	}
   111  
   112  	switch *typ.Type {
   113  	case parquet.Type_BOOLEAN:
   114  		return getBooleanValuesDecoder(pageEncoding)
   115  
   116  	case parquet.Type_BYTE_ARRAY:
   117  		return getByteArrayValuesDecoder(pageEncoding, dictValues)
   118  
   119  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
   120  		if typ.TypeLength == nil {
   121  			return nil, fmt.Errorf("type %s with nil type len", typ.Type)
   122  		}
   123  		return getFixedLenByteArrayValuesDecoder(pageEncoding, int(*typ.TypeLength), dictValues)
   124  	case parquet.Type_FLOAT:
   125  		switch pageEncoding {
   126  		case parquet.Encoding_PLAIN:
   127  			return &floatPlainDecoder{}, nil
   128  		case parquet.Encoding_RLE_DICTIONARY:
   129  			return &dictDecoder{uniqueValues: dictValues}, nil
   130  		}
   131  
   132  	case parquet.Type_DOUBLE:
   133  		switch pageEncoding {
   134  		case parquet.Encoding_PLAIN:
   135  			return &doublePlainDecoder{}, nil
   136  		case parquet.Encoding_RLE_DICTIONARY:
   137  			return &dictDecoder{uniqueValues: dictValues}, nil
   138  		}
   139  
   140  	case parquet.Type_INT32:
   141  		return getInt32ValuesDecoder(pageEncoding, typ, dictValues)
   142  
   143  	case parquet.Type_INT64:
   144  		return getInt64ValuesDecoder(pageEncoding, typ, dictValues)
   145  
   146  	case parquet.Type_INT96:
   147  		switch pageEncoding {
   148  		case parquet.Encoding_PLAIN:
   149  			return &int96PlainDecoder{}, nil
   150  		case parquet.Encoding_RLE_DICTIONARY:
   151  			return &dictDecoder{uniqueValues: dictValues}, nil
   152  		}
   153  
   154  	default:
   155  		return nil, fmt.Errorf("unsupported type %s", typ.Type)
   156  	}
   157  
   158  	return nil, fmt.Errorf("unsupported encoding %s for %s type", pageEncoding, typ.Type)
   159  }
   160  
   161  func readPageBlock(r io.Reader, codec parquet.CompressionCodec, compressedSize int32, uncompressedSize int32, validateCRC bool, crc *int32, alloc *allocTracker) ([]byte, error) {
   162  	if compressedSize < 0 || uncompressedSize < 0 {
   163  		return nil, errors.New("invalid page data size")
   164  	}
   165  
   166  	alloc.test(uint64(compressedSize))
   167  	dataPageBlock, err := io.ReadAll(io.LimitReader(r, int64(compressedSize)))
   168  	if err != nil {
   169  		return nil, fmt.Errorf("read failed: %w", err)
   170  	}
   171  	alloc.register(dataPageBlock, uint64(len(dataPageBlock)))
   172  
   173  	if validateCRC && crc != nil {
   174  		if sum := crc32.ChecksumIEEE(dataPageBlock); sum != uint32(*crc) {
   175  			return nil, fmt.Errorf("CRC32 check failed: expected CRC32 %x, got %x", sum, uint32(*crc))
   176  		}
   177  	}
   178  
   179  	return dataPageBlock, nil
   180  }
   181  
// readPages reads every page of a single column chunk from r, stopping once
// chunkMeta.TotalCompressedSize bytes have been consumed. It returns the page
// readers in file order and reports (via useDict) whether the chunk contained
// a dictionary page. dDecoder and rDecoder construct the definition- and
// repetition-level decoders for each data page.
func (f *FileReader) readPages(ctx context.Context, r *offsetReader, col *Column, chunkMeta *parquet.ColumnMetaData, dDecoder, rDecoder getLevelDecoder) (pages []pageReader, useDict bool, err error) {
	var (
		dictPage *dictPageReader
	)

	for {
		// Stop when the whole chunk, as declared in the metadata, has been read.
		if chunkMeta.TotalCompressedSize-r.Count() <= 0 {
			break
		}
		// Each page is preceded by a thrift-encoded PageHeader.
		ph := &parquet.PageHeader{}
		if err := readThrift(ctx, ph, r); err != nil {
			return nil, false, err
		}

		if ph.Type == parquet.PageType_DICTIONARY_PAGE {
			// A chunk may contain at most one dictionary page.
			if dictPage != nil {
				return nil, false, errors.New("there should be only one dictionary")
			}
			p := &dictPageReader{
				alloc:       f.allocTracker,
				validateCRC: f.schemaReader.validateCRC,
			}
			de, err := getDictValuesDecoder(col.Element())
			if err != nil {
				return nil, false, err
			}
			if err := p.init(de); err != nil {
				return nil, false, err
			}

			if err := p.read(r, ph, chunkMeta.Codec); err != nil {
				return nil, false, err
			}

			dictPage = p

			// Go to the next data Page
			// if we have a DictionaryPageOffset we should return to DataPageOffset
			if chunkMeta.DictionaryPageOffset != nil {
				if *chunkMeta.DictionaryPageOffset != r.offset {
					if _, err := r.Seek(chunkMeta.DataPageOffset, io.SeekStart); err != nil {
						return nil, false, err
					}
				}
			}
			continue // go to next page
		}

		// Any non-dictionary page must be one of the two data page versions.
		var p pageReader
		switch ph.Type {
		case parquet.PageType_DATA_PAGE:
			p = &dataPageReaderV1{
				alloc: f.allocTracker,
				ph:    ph,
			}
		case parquet.PageType_DATA_PAGE_V2:
			p = &dataPageReaderV2{
				alloc: f.allocTracker,
				ph:    ph,
			}
		default:
			return nil, false, fmt.Errorf("DATA_PAGE or DATA_PAGE_V2 type supported, but was %s", ph.Type)
		}
		// Hand each page its own copy of the dictionary values so a page
		// cannot mutate the values seen by its siblings.
		var dictValue []interface{}
		if dictPage != nil {
			dictValue = dictPage.values
		}
		var fn = func(typ parquet.Encoding) (valuesDecoder, error) {
			return getValuesDecoder(typ, col.Element(), clone(dictValue))
		}
		if err := p.init(dDecoder, rDecoder, fn); err != nil {
			return nil, false, err
		}

		if err := p.read(r, ph, chunkMeta.Codec, f.schemaReader.validateCRC); err != nil {
			return nil, false, err
		}
		pages = append(pages, p)
	}

	return pages, dictPage != nil, nil
}
   264  
   265  func clone(in []interface{}) []interface{} {
   266  	out := make([]interface{}, len(in))
   267  	copy(out, in)
   268  	return out
   269  }
   270  
   271  func (f *FileReader) skipChunk(col *Column, chunk *parquet.ColumnChunk) error {
   272  	if chunk.FilePath != nil {
   273  		return fmt.Errorf("nyi: data is in another file: '%s'", *chunk.FilePath)
   274  	}
   275  
   276  	c := col.Index()
   277  	// chunk.FileOffset is useless so ChunkMetaData is required here
   278  	// as we cannot read it from r
   279  	// see https://issues.apache.org/jira/browse/PARQUET-291
   280  	if chunk.MetaData == nil {
   281  		return fmt.Errorf("missing meta data for Column %c", c)
   282  	}
   283  
   284  	if typ := *col.Element().Type; chunk.MetaData.Type != typ {
   285  		return fmt.Errorf("wrong type in Column chunk metadata, expected %s was %s",
   286  			typ, chunk.MetaData.Type)
   287  	}
   288  
   289  	offset := chunk.MetaData.DataPageOffset
   290  	if chunk.MetaData.DictionaryPageOffset != nil {
   291  		offset = *chunk.MetaData.DictionaryPageOffset
   292  	}
   293  
   294  	offset += chunk.MetaData.TotalCompressedSize
   295  	_, err := f.reader.Seek(offset, io.SeekStart)
   296  	return err
   297  }
   298  
   299  func (f *FileReader) readChunk(ctx context.Context, col *Column, chunk *parquet.ColumnChunk) (pages []pageReader, useDict bool, err error) {
   300  	if chunk.FilePath != nil {
   301  		return nil, false, fmt.Errorf("nyi: data is in another file: '%s'", *chunk.FilePath)
   302  	}
   303  
   304  	c := col.Index()
   305  	// chunk.FileOffset is useless so ChunkMetaData is required here
   306  	// as we cannot read it from r
   307  	// see https://issues.apache.org/jira/browse/PARQUET-291
   308  	if chunk.MetaData == nil {
   309  		return nil, false, fmt.Errorf("missing meta data for Column %c", c)
   310  	}
   311  
   312  	if typ := *col.Element().Type; chunk.MetaData.Type != typ {
   313  		return nil, false, fmt.Errorf("wrong type in Column chunk metadata, expected %s was %s",
   314  			typ, chunk.MetaData.Type)
   315  	}
   316  
   317  	offset := chunk.MetaData.DataPageOffset
   318  	if chunk.MetaData.DictionaryPageOffset != nil {
   319  		offset = *chunk.MetaData.DictionaryPageOffset
   320  	}
   321  	// Seek to the beginning of the first Page
   322  	if _, err := f.reader.Seek(offset, io.SeekStart); err != nil {
   323  		return nil, false, err
   324  	}
   325  
   326  	reader := &offsetReader{
   327  		inner:  f.reader,
   328  		offset: offset,
   329  		count:  0,
   330  	}
   331  
   332  	rDecoder := func(enc parquet.Encoding) (levelDecoder, error) {
   333  		if enc != parquet.Encoding_RLE {
   334  			return nil, fmt.Errorf("%q is not supported for definition and repetition level", enc)
   335  		}
   336  		dec := newHybridDecoder(bits.Len16(col.MaxRepetitionLevel()))
   337  		dec.buffered = true
   338  		return &levelDecoderWrapper{decoder: dec, max: col.MaxRepetitionLevel()}, nil
   339  	}
   340  
   341  	dDecoder := func(enc parquet.Encoding) (levelDecoder, error) {
   342  		if enc != parquet.Encoding_RLE {
   343  			return nil, fmt.Errorf("%q is not supported for definition and repetition level", enc)
   344  		}
   345  		dec := newHybridDecoder(bits.Len16(col.MaxDefinitionLevel()))
   346  		dec.buffered = true
   347  		return &levelDecoderWrapper{decoder: dec, max: col.MaxDefinitionLevel()}, nil
   348  	}
   349  
   350  	if col.MaxRepetitionLevel() == 0 {
   351  		rDecoder = func(parquet.Encoding) (levelDecoder, error) {
   352  			return &levelDecoderWrapper{decoder: constDecoder(0), max: col.MaxRepetitionLevel()}, nil
   353  		}
   354  	}
   355  
   356  	if col.MaxDefinitionLevel() == 0 {
   357  		dDecoder = func(parquet.Encoding) (levelDecoder, error) {
   358  			return &levelDecoderWrapper{decoder: constDecoder(0), max: col.MaxDefinitionLevel()}, nil
   359  		}
   360  	}
   361  	return f.readPages(ctx, reader, col, chunk.MetaData, dDecoder, rDecoder)
   362  }
   363  
   364  func readPageData(col *Column, pages []pageReader, useDict bool) error {
   365  	s := col.getColumnStore()
   366  	s.pageIdx, s.pages = 0, pages
   367  	s.useDict = useDict
   368  	if err := s.readNextPage(); err != nil {
   369  		return nil
   370  	}
   371  
   372  	return nil
   373  }
   374  
   375  func (f *FileReader) readRowGroupData(ctx context.Context) error {
   376  	rowGroup := f.meta.RowGroups[f.rowGroupPosition-1]
   377  	dataCols := f.schemaReader.Columns()
   378  
   379  	f.schemaReader.resetData()
   380  	f.schemaReader.setNumRecords(rowGroup.NumRows)
   381  	for _, c := range dataCols {
   382  		idx := c.Index()
   383  		if len(rowGroup.Columns) <= idx {
   384  			return fmt.Errorf("column index %d is out of bounds", idx)
   385  		}
   386  		chunk := rowGroup.Columns[c.Index()]
   387  		if !f.schemaReader.isSelectedByPath(c.path) {
   388  			if err := f.skipChunk(c, chunk); err != nil {
   389  				return err
   390  			}
   391  			c.data.skipped = true
   392  			continue
   393  		}
   394  		pages, useDict, err := f.readChunk(ctx, c, chunk)
   395  		if err != nil {
   396  			return err
   397  		}
   398  		if err := readPageData(c, pages, useDict); err != nil {
   399  			return err
   400  		}
   401  	}
   402  
   403  	return nil
   404  }