github.com/fraugster/parquet-go@v0.12.0/page_dict.go (about) 1 package goparquet 2 3 import ( 4 "bytes" 5 "context" 6 "errors" 7 "fmt" 8 "hash/crc32" 9 "io" 10 11 "github.com/fraugster/parquet-go/parquet" 12 ) 13 14 // dictionaryPage is not a real data page, so there is no need to implement the page interface 15 type dictPageReader struct { 16 values []interface{} 17 enc valuesDecoder 18 ph *parquet.PageHeader 19 20 numValues int32 21 validateCRC bool 22 23 alloc *allocTracker 24 } 25 26 func (dp *dictPageReader) init(dict valuesDecoder) error { 27 if dict == nil { 28 return errors.New("dictionary page without dictionary value encoder") 29 } 30 31 dp.enc = dict 32 return nil 33 } 34 35 func (dp *dictPageReader) read(r io.Reader, ph *parquet.PageHeader, codec parquet.CompressionCodec) error { 36 if ph.DictionaryPageHeader == nil { 37 return fmt.Errorf("null DictionaryPageHeader in %+v", ph) 38 } 39 40 if dp.numValues = ph.DictionaryPageHeader.NumValues; dp.numValues < 0 { 41 return fmt.Errorf("negative NumValues in DICTIONARY_PAGE: %d", dp.numValues) 42 } 43 44 if ph.DictionaryPageHeader.Encoding != parquet.Encoding_PLAIN && ph.DictionaryPageHeader.Encoding != parquet.Encoding_PLAIN_DICTIONARY { 45 return fmt.Errorf("only Encoding_PLAIN and Encoding_PLAIN_DICTIONARY is supported for dict values encoder") 46 } 47 48 dp.ph = ph 49 50 dictPageBlock, err := readPageBlock(r, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), dp.validateCRC, ph.Crc, dp.alloc) 51 if err != nil { 52 return err 53 } 54 55 reader, err := newBlockReader(dictPageBlock, codec, ph.GetCompressedPageSize(), ph.GetUncompressedPageSize(), dp.alloc) 56 if err != nil { 57 return err 58 } 59 60 dp.values = make([]interface{}, dp.numValues) 61 62 if err := dp.enc.init(reader); err != nil { 63 return err 64 } 65 66 // no error is accepted here, even EOF 67 if n, err := dp.enc.decodeValues(dp.values); err != nil { 68 return fmt.Errorf("expected %d values, read %d values: %w", dp.numValues, n, err) 69 } 70 71 return nil 72 } 73 74 type dictPageWriter struct { 75 sch *schema 76 col *Column 77 codec parquet.CompressionCodec 78 dictValues []interface{} 79 } 80 81 func (dp *dictPageWriter) init(sch *schema, col *Column, codec parquet.CompressionCodec, dictValues []interface{}) error { 82 dp.sch = sch 83 dp.col = col 84 dp.codec = codec 85 dp.dictValues = dictValues 86 return nil 87 } 88 89 func (dp *dictPageWriter) getHeader(comp, unComp int, crc32Checksum *int32) *parquet.PageHeader { 90 ph := &parquet.PageHeader{ 91 Type: parquet.PageType_DICTIONARY_PAGE, 92 UncompressedPageSize: int32(unComp), 93 CompressedPageSize: int32(comp), 94 Crc: crc32Checksum, 95 DictionaryPageHeader: &parquet.DictionaryPageHeader{ 96 NumValues: int32(len(dp.dictValues)), 97 Encoding: parquet.Encoding_PLAIN, // PLAIN_DICTIONARY is deprecated in the Parquet 2.0 specification 98 IsSorted: nil, 99 }, 100 } 101 return ph 102 } 103 104 func (dp *dictPageWriter) write(ctx context.Context, w io.Writer) (int, int, error) { 105 // In V1 data page is compressed separately 106 dataBuf := &bytes.Buffer{} 107 108 encoder, err := getDictValuesEncoder(dp.col.Element()) 109 if err != nil { 110 return 0, 0, err 111 } 112 113 err = encodeValue(dataBuf, encoder, dp.dictValues) 114 if err != nil { 115 return 0, 0, err 116 } 117 118 comp, err := compressBlock(dataBuf.Bytes(), dp.codec) 119 if err != nil { 120 return 0, 0, fmt.Errorf("compressing data failed with %s method: %w", dp.codec, err) 121 } 122 compSize, unCompSize := len(comp), len(dataBuf.Bytes()) 123 124 var crc32Checksum *int32 125 if dp.sch.enableCRC { 126 sum := int32(crc32.ChecksumIEEE(comp)) 127 crc32Checksum = &sum 128 } 129 130 header := dp.getHeader(compSize, unCompSize, crc32Checksum) 131 if err := writeThrift(ctx, header, w); err != nil { 132 return 0, 0, err 133 } 134 135 return compSize, unCompSize, writeFull(w, comp) 136 }