github.com/fraugster/parquet-go@v0.12.0/page_dict.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"hash/crc32"
     9  	"io"
    10  
    11  	"github.com/fraugster/parquet-go/parquet"
    12  )
    13  
// dictPageReader reads a dictionary page and keeps its decoded values.
//
// dictionaryPage is not a real data page, so there is no need to implement the page interface
type dictPageReader struct {
	// values holds the decoded dictionary entries; read allocates it with
	// numValues elements and fills it through enc.
	values []interface{}
	// enc is the decoder for the dictionary values; it is set by init.
	enc    valuesDecoder
	// ph is the page header that was passed to the most recent read call.
	ph     *parquet.PageHeader

	// numValues is copied from DictionaryPageHeader.NumValues by read.
	numValues   int32
	// validateCRC is forwarded to readPageBlock together with the header's
	// Crc field; presumably it enables checksum validation of the page block.
	validateCRC bool

	// alloc is forwarded to readPageBlock and newBlockReader.
	// NOTE(review): looks like it tracks memory used while reading — confirm.
	alloc *allocTracker
}
    25  
    26  func (dp *dictPageReader) init(dict valuesDecoder) error {
    27  	if dict == nil {
    28  		return errors.New("dictionary page without dictionary value encoder")
    29  	}
    30  
    31  	dp.enc = dict
    32  	return nil
    33  }
    34  
    35  func (dp *dictPageReader) read(r io.Reader, ph *parquet.PageHeader, codec parquet.CompressionCodec) error {
    36  	if ph.DictionaryPageHeader == nil {
    37  		return fmt.Errorf("null DictionaryPageHeader in %+v", ph)
    38  	}
    39  
    40  	if dp.numValues = ph.DictionaryPageHeader.NumValues; dp.numValues < 0 {
    41  		return fmt.Errorf("negative NumValues in DICTIONARY_PAGE: %d", dp.numValues)
    42  	}
    43  
    44  	if ph.DictionaryPageHeader.Encoding != parquet.Encoding_PLAIN && ph.DictionaryPageHeader.Encoding != parquet.Encoding_PLAIN_DICTIONARY {
    45  		return fmt.Errorf("only Encoding_PLAIN and Encoding_PLAIN_DICTIONARY is supported for dict values encoder")
    46  	}
    47  
    48  	dp.ph = ph
    49  
    50  	dictPageBlock, err := readPageBlock(r, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), dp.validateCRC, ph.Crc, dp.alloc)
    51  	if err != nil {
    52  		return err
    53  	}
    54  
    55  	reader, err := newBlockReader(dictPageBlock, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), dp.alloc)
    56  	if err != nil {
    57  		return err
    58  	}
    59  
    60  	dp.values = make([]interface{}, dp.numValues)
    61  
    62  	if err := dp.enc.init(reader); err != nil {
    63  		return err
    64  	}
    65  
    66  	// no error is accepted here, even EOF
    67  	if n, err := dp.enc.decodeValues(dp.values); err != nil {
    68  		return fmt.Errorf("expected %d values, read %d values: %w", dp.numValues, n, err)
    69  	}
    70  
    71  	return nil
    72  }
    73  
// dictPageWriter serializes a column's dictionary values as a dictionary page.
type dictPageWriter struct {
	// sch is the schema; write reads its enableCRC flag to decide whether a
	// CRC32 checksum of the compressed payload is put into the page header.
	sch        *schema
	// col is the column the dictionary belongs to; write uses col.Element()
	// to obtain the dictionary values encoder.
	col        *Column
	// codec is the compression codec write applies to the encoded values.
	codec      parquet.CompressionCodec
	// dictValues are the dictionary entries to encode; their count becomes
	// NumValues in the dictionary page header.
	dictValues []interface{}
}
    80  
    81  func (dp *dictPageWriter) init(sch *schema, col *Column, codec parquet.CompressionCodec, dictValues []interface{}) error {
    82  	dp.sch = sch
    83  	dp.col = col
    84  	dp.codec = codec
    85  	dp.dictValues = dictValues
    86  	return nil
    87  }
    88  
    89  func (dp *dictPageWriter) getHeader(comp, unComp int, crc32Checksum *int32) *parquet.PageHeader {
    90  	ph := &parquet.PageHeader{
    91  		Type:                 parquet.PageType_DICTIONARY_PAGE,
    92  		UncompressedPageSize: int32(unComp),
    93  		CompressedPageSize:   int32(comp),
    94  		Crc:                  crc32Checksum,
    95  		DictionaryPageHeader: &parquet.DictionaryPageHeader{
    96  			NumValues: int32(len(dp.dictValues)),
    97  			Encoding:  parquet.Encoding_PLAIN, // PLAIN_DICTIONARY is deprecated in the Parquet 2.0 specification
    98  			IsSorted:  nil,
    99  		},
   100  	}
   101  	return ph
   102  }
   103  
   104  func (dp *dictPageWriter) write(ctx context.Context, w io.Writer) (int, int, error) {
   105  	// In V1 data page is compressed separately
   106  	dataBuf := &bytes.Buffer{}
   107  
   108  	encoder, err := getDictValuesEncoder(dp.col.Element())
   109  	if err != nil {
   110  		return 0, 0, err
   111  	}
   112  
   113  	err = encodeValue(dataBuf, encoder, dp.dictValues)
   114  	if err != nil {
   115  		return 0, 0, err
   116  	}
   117  
   118  	comp, err := compressBlock(dataBuf.Bytes(), dp.codec)
   119  	if err != nil {
   120  		return 0, 0, fmt.Errorf("compressing data failed with %s method: %w", dp.codec, err)
   121  	}
   122  	compSize, unCompSize := len(comp), len(dataBuf.Bytes())
   123  
   124  	var crc32Checksum *int32
   125  	if dp.sch.enableCRC {
   126  		sum := int32(crc32.ChecksumIEEE(comp))
   127  		crc32Checksum = &sum
   128  	}
   129  
   130  	header := dp.getHeader(compSize, unCompSize, crc32Checksum)
   131  	if err := writeThrift(ctx, header, w); err != nil {
   132  		return 0, 0, err
   133  	}
   134  
   135  	return compSize, unCompSize, writeFull(w, comp)
   136  }