storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/page.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2018 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package parquet
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"io"
    25  	"math"
    26  	"strings"
    27  
    28  	"git.apache.org/thrift.git/lib/go/thrift"
    29  
    30  	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
    31  )
    32  
    33  // getBitWidth - returns bits required to place num e.g.
    34  //
    35  //    num | width
    36  //   -----|-------
    37  //     0  |   0
    38  //     1  |   1
    39  //     2  |   2
    40  //     3  |   2
    41  //     4  |   3
    42  //     5  |   3
    43  //    ... |  ...
    44  //    ... |  ...
    45  //
    46  func getBitWidth(num uint64) (width uint64) {
    47  	for ; num != 0; num >>= 1 {
    48  		width++
    49  	}
    50  
    51  	return width
    52  }
    53  
    54  // getMaxDefLevel - get maximum definition level.
    55  func getMaxDefLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) {
    56  	for i := 1; i <= len(path); i++ {
    57  		name := strings.Join(path[:i], ".")
    58  		if index, ok := nameIndexMap[name]; ok {
    59  			if schemaElements[index].GetRepetitionType() != parquet.FieldRepetitionType_REQUIRED {
    60  				v++
    61  			}
    62  		}
    63  	}
    64  
    65  	return v
    66  }
    67  
    68  // getMaxRepLevel - get maximum repetition level.
    69  func getMaxRepLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) {
    70  	for i := 1; i <= len(path); i++ {
    71  		name := strings.Join(path[:i], ".")
    72  		if index, ok := nameIndexMap[name]; ok {
    73  			if schemaElements[index].GetRepetitionType() == parquet.FieldRepetitionType_REPEATED {
    74  				v++
    75  			}
    76  		}
    77  	}
    78  
    79  	return v
    80  }
    81  
    82  func readPageHeader(reader *thrift.TBufferedTransport) (*parquet.PageHeader, error) {
    83  	pageHeader := parquet.NewPageHeader()
    84  	if err := pageHeader.Read(thrift.NewTCompactProtocol(reader)); err != nil {
    85  		return nil, err
    86  	}
    87  
    88  	return pageHeader, nil
    89  }
    90  
    91  func readPage(
    92  	thriftReader *thrift.TBufferedTransport,
    93  	metadata *parquet.ColumnMetaData,
    94  	columnNameIndexMap map[string]int,
    95  	schemaElements []*parquet.SchemaElement,
    96  ) (page *page, definitionLevels, numRows int64, err error) {
    97  
    98  	pageHeader, err := readPageHeader(thriftReader)
    99  	if err != nil {
   100  		return nil, 0, 0, err
   101  	}
   102  
   103  	read := func() (data []byte, err error) {
   104  		var repLevelsLen, defLevelsLen int32
   105  		var repLevelsBuf, defLevelsBuf []byte
   106  
   107  		if pageHeader.GetType() == parquet.PageType_DATA_PAGE_V2 {
   108  			if pageHeader.DataPageHeaderV2 == nil {
   109  				return nil, errors.New("parquet: Header not set")
   110  			}
   111  			repLevelsLen = pageHeader.DataPageHeaderV2.GetRepetitionLevelsByteLength()
   112  			repLevelsBuf = make([]byte, repLevelsLen)
   113  
   114  			n, err := io.ReadFull(thriftReader, repLevelsBuf)
   115  			if err != nil {
   116  				return nil, err
   117  			}
   118  			if n != int(repLevelsLen) {
   119  				return nil, fmt.Errorf("expected parquet header repetition levels %d, got %d", repLevelsLen, n)
   120  			}
   121  
   122  			defLevelsLen = pageHeader.DataPageHeaderV2.GetDefinitionLevelsByteLength()
   123  			defLevelsBuf = make([]byte, defLevelsLen)
   124  
   125  			n, err = io.ReadFull(thriftReader, defLevelsBuf)
   126  			if err != nil {
   127  				return nil, err
   128  			}
   129  			if n != int(defLevelsLen) {
   130  				return nil, fmt.Errorf("expected parquet header definition levels %d, got %d", defLevelsLen, n)
   131  			}
   132  		}
   133  		dbLen := pageHeader.GetCompressedPageSize() - repLevelsLen - defLevelsLen
   134  		if dbLen < 0 {
   135  			return nil, errors.New("parquet: negative data length")
   136  		}
   137  
   138  		dataBuf := make([]byte, dbLen)
   139  		n, err := io.ReadFull(thriftReader, dataBuf)
   140  		if err != nil {
   141  			return nil, err
   142  		}
   143  		if n != int(dbLen) {
   144  			return nil, fmt.Errorf("expected parquet data buffer %d, got %d", dbLen, n)
   145  		}
   146  
   147  		if dataBuf, err = compressionCodec(metadata.GetCodec()).uncompress(dataBuf); err != nil {
   148  			return nil, err
   149  		}
   150  
   151  		if repLevelsLen == 0 && defLevelsLen == 0 {
   152  			return dataBuf, nil
   153  		}
   154  
   155  		if repLevelsLen > 0 {
   156  			data = append(data, uint32ToBytes(uint32(repLevelsLen))...)
   157  			data = append(data, repLevelsBuf...)
   158  		}
   159  
   160  		if defLevelsLen > 0 {
   161  			data = append(data, uint32ToBytes(uint32(defLevelsLen))...)
   162  			data = append(data, defLevelsBuf...)
   163  		}
   164  
   165  		data = append(data, dataBuf...)
   166  
   167  		return data, nil
   168  	}
   169  
   170  	buf, err := read()
   171  	if err != nil {
   172  		return nil, 0, 0, err
   173  	}
   174  	if metadata == nil {
   175  		return nil, 0, 0, errors.New("parquet: metadata not set")
   176  	}
   177  	path := append([]string{}, metadata.GetPathInSchema()...)
   178  
   179  	bytesReader := bytes.NewReader(buf)
   180  	pageType := pageHeader.GetType()
   181  	switch pageType {
   182  	case parquet.PageType_INDEX_PAGE:
   183  		return nil, 0, 0, fmt.Errorf("page type %v is not supported", parquet.PageType_INDEX_PAGE)
   184  
   185  	case parquet.PageType_DICTIONARY_PAGE:
   186  		page = newDictPage()
   187  		page.Header = pageHeader
   188  		table := new(table)
   189  		table.Path = path
   190  		if pageHeader.DictionaryPageHeader == nil {
   191  			return nil, 0, 0, errors.New("parquet: dictionary not set")
   192  		}
   193  		values, err := readValues(bytesReader, metadata.GetType(),
   194  			uint64(pageHeader.DictionaryPageHeader.GetNumValues()), 0)
   195  		if err != nil {
   196  			return nil, 0, 0, err
   197  		}
   198  		table.Values = getTableValues(values, metadata.GetType())
   199  		page.DataTable = table
   200  
   201  		return page, 0, 0, nil
   202  
   203  	case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2:
   204  		name := strings.Join(path, ".")
   205  
   206  		page = newDataPage()
   207  		page.Header = pageHeader
   208  
   209  		maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, path)
   210  		maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, path)
   211  
   212  		var numValues uint64
   213  		var encodingType parquet.Encoding
   214  
   215  		if pageHeader.GetType() == parquet.PageType_DATA_PAGE {
   216  			if pageHeader.DataPageHeader == nil {
   217  				return nil, 0, 0, errors.New("parquet: Header not set")
   218  			}
   219  			numValues = uint64(pageHeader.DataPageHeader.GetNumValues())
   220  			encodingType = pageHeader.DataPageHeader.GetEncoding()
   221  		} else {
   222  			if pageHeader.DataPageHeaderV2 == nil {
   223  				return nil, 0, 0, errors.New("parquet: Header not set")
   224  			}
   225  			numValues = uint64(pageHeader.DataPageHeaderV2.GetNumValues())
   226  			encodingType = pageHeader.DataPageHeaderV2.GetEncoding()
   227  		}
   228  
   229  		var repetitionLevels []int64
   230  		if maxRepetitionLevel > 0 {
   231  			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
   232  				-1, numValues, getBitWidth(uint64(maxRepetitionLevel)))
   233  			if err != nil {
   234  				return nil, 0, 0, err
   235  			}
   236  
   237  			if repetitionLevels = values.([]int64); len(repetitionLevels) > int(numValues) && int(numValues) >= 0 {
   238  				repetitionLevels = repetitionLevels[:numValues]
   239  			}
   240  		} else {
   241  			if numValues > math.MaxInt64/8 {
   242  				return nil, 0, 0, errors.New("parquet: numvalues too large")
   243  			}
   244  			repetitionLevels = make([]int64, numValues)
   245  		}
   246  
   247  		var definitionLevels []int64
   248  		if maxDefinitionLevel > 0 {
   249  			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
   250  				-1, numValues, getBitWidth(uint64(maxDefinitionLevel)))
   251  			if err != nil {
   252  				return nil, 0, 0, err
   253  			}
   254  			if numValues > math.MaxInt64/8 {
   255  				return nil, 0, 0, errors.New("parquet: numvalues too large")
   256  			}
   257  			if definitionLevels = values.([]int64); len(definitionLevels) > int(numValues) {
   258  				definitionLevels = definitionLevels[:numValues]
   259  			}
   260  		} else {
   261  			if numValues > math.MaxInt64/8 {
   262  				return nil, 0, 0, errors.New("parquet: numvalues too large")
   263  			}
   264  			definitionLevels = make([]int64, numValues)
   265  		}
   266  
   267  		var numNulls uint64
   268  		for i := 0; i < len(definitionLevels); i++ {
   269  			if definitionLevels[i] != int64(maxDefinitionLevel) {
   270  				numNulls++
   271  			}
   272  		}
   273  
   274  		var convertedType parquet.ConvertedType = -1
   275  		if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() {
   276  			convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType()
   277  		}
   278  		values, valueType, err := readDataPageValues(bytesReader, encodingType, metadata.GetType(),
   279  			convertedType, uint64(len(definitionLevels))-numNulls,
   280  			uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength()))
   281  		if err != nil {
   282  			return nil, 0, 0, err
   283  		}
   284  		tableValues := getTableValues(values, valueType)
   285  
   286  		table := new(table)
   287  		table.Path = path
   288  		table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType()
   289  		table.MaxRepetitionLevel = int32(maxRepetitionLevel)
   290  		table.MaxDefinitionLevel = int32(maxDefinitionLevel)
   291  		table.Values = make([]interface{}, len(definitionLevels))
   292  		table.RepetitionLevels = make([]int32, len(definitionLevels))
   293  		table.DefinitionLevels = make([]int32, len(definitionLevels))
   294  
   295  		j := 0
   296  		numRows := int64(0)
   297  		for i := 0; i < len(definitionLevels); i++ {
   298  			table.RepetitionLevels[i] = int32(repetitionLevels[i])
   299  			table.DefinitionLevels[i] = int32(definitionLevels[i])
   300  			if int(table.DefinitionLevels[i]) == maxDefinitionLevel {
   301  				table.Values[i] = tableValues[j]
   302  				j++
   303  			}
   304  			if table.RepetitionLevels[i] == 0 {
   305  				numRows++
   306  			}
   307  		}
   308  		page.DataTable = table
   309  
   310  		return page, int64(len(definitionLevels)), numRows, nil
   311  	}
   312  
   313  	return nil, 0, 0, fmt.Errorf("unknown page type %v", pageType)
   314  }
   315  
   316  type page struct {
   317  	Header       *parquet.PageHeader      // Header of a page
   318  	DataTable    *table                   // Table to store values
   319  	RawData      []byte                   // Compressed data of the page, which is written in parquet file
   320  	CompressType parquet.CompressionCodec // Compress type: gzip/snappy/none
   321  	DataType     parquet.Type             // Parquet type of the values in the page
   322  	Path         []string                 // Path in schema(include the root)
   323  	MaxVal       interface{}              // Maximum of the values
   324  	MinVal       interface{}              // Minimum of the values
   325  	PageSize     int32
   326  }
   327  
   328  func newPage() *page {
   329  	return &page{
   330  		Header:   parquet.NewPageHeader(),
   331  		PageSize: defaultPageSize,
   332  	}
   333  }
   334  
   335  func newDictPage() *page {
   336  	page := newPage()
   337  	page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
   338  	return page
   339  }
   340  
   341  func newDataPage() *page {
   342  	page := newPage()
   343  	page.Header.DataPageHeader = parquet.NewDataPageHeader()
   344  	return page
   345  }
   346  
   347  func (page *page) decode(dictPage *page) {
   348  	if dictPage == nil || page == nil || page.Header.DataPageHeader == nil ||
   349  		(page.Header.DataPageHeader.Encoding != parquet.Encoding_RLE_DICTIONARY &&
   350  			page.Header.DataPageHeader.Encoding != parquet.Encoding_PLAIN_DICTIONARY) {
   351  		return
   352  	}
   353  
   354  	for i := 0; i < len(page.DataTable.Values); i++ {
   355  		if page.DataTable.Values[i] != nil {
   356  			index, ok := page.DataTable.Values[i].(int64)
   357  			if !ok || int(index) >= len(dictPage.DataTable.Values) {
   358  				return
   359  			}
   360  			page.DataTable.Values[i] = dictPage.DataTable.Values[index]
   361  		}
   362  	}
   363  }
   364  
   365  // Get RepetitionLevels and Definitions from RawData
   366  func (page *page) getRLDLFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (numValues int64, numRows int64, err error) {
   367  	bytesReader := bytes.NewReader(page.RawData)
   368  
   369  	pageType := page.Header.GetType()
   370  
   371  	var buf []byte
   372  	if pageType == parquet.PageType_DATA_PAGE_V2 {
   373  		var repLevelsLen, defLevelsLen int32
   374  		var repLevelsBuf, defLevelsBuf []byte
   375  		if page.Header.DataPageHeaderV2 == nil {
   376  			return 0, 0, errors.New("parquet: Header not set")
   377  		}
   378  		repLevelsLen = page.Header.DataPageHeaderV2.GetRepetitionLevelsByteLength()
   379  		repLevelsBuf = make([]byte, repLevelsLen)
   380  		if _, err = bytesReader.Read(repLevelsBuf); err != nil {
   381  			return 0, 0, err
   382  		}
   383  
   384  		defLevelsLen = page.Header.DataPageHeaderV2.GetDefinitionLevelsByteLength()
   385  		defLevelsBuf = make([]byte, defLevelsLen)
   386  		if _, err = bytesReader.Read(defLevelsBuf); err != nil {
   387  			return 0, 0, err
   388  		}
   389  
   390  		dataBuf := make([]byte, len(page.RawData)-int(repLevelsLen)-int(defLevelsLen))
   391  		if _, err = bytesReader.Read(dataBuf); err != nil {
   392  			return 0, 0, err
   393  		}
   394  
   395  		if repLevelsLen == 0 && defLevelsLen == 0 {
   396  			buf = dataBuf
   397  		} else {
   398  			if repLevelsLen > 0 {
   399  				buf = append(buf, uint32ToBytes(uint32(repLevelsLen))...)
   400  				buf = append(buf, repLevelsBuf...)
   401  			}
   402  
   403  			if defLevelsLen > 0 {
   404  				buf = append(buf, uint32ToBytes(uint32(defLevelsLen))...)
   405  				buf = append(buf, defLevelsBuf...)
   406  			}
   407  
   408  			buf = append(buf, dataBuf...)
   409  		}
   410  	} else {
   411  		if buf, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil {
   412  			return 0, 0, err
   413  		}
   414  	}
   415  
   416  	bytesReader = bytes.NewReader(buf)
   417  
   418  	switch pageType {
   419  	case parquet.PageType_DICTIONARY_PAGE:
   420  		table := new(table)
   421  		table.Path = page.Path
   422  		page.DataTable = table
   423  		return 0, 0, nil
   424  
   425  	case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2:
   426  		var numValues uint64
   427  		if pageType == parquet.PageType_DATA_PAGE {
   428  			if page.Header.DataPageHeader == nil {
   429  				return 0, 0, errors.New("parquet: Header not set")
   430  			}
   431  			numValues = uint64(page.Header.DataPageHeader.GetNumValues())
   432  		} else {
   433  			if page.Header.DataPageHeaderV2 == nil {
   434  				return 0, 0, errors.New("parquet: Header not set")
   435  			}
   436  			numValues = uint64(page.Header.DataPageHeaderV2.GetNumValues())
   437  		}
   438  
   439  		maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, page.Path)
   440  		maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, page.Path)
   441  
   442  		var repetitionLevels []int64
   443  		if maxRepetitionLevel > 0 {
   444  			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
   445  				-1, numValues, getBitWidth(uint64(maxRepetitionLevel)))
   446  			if err != nil {
   447  				return 0, 0, err
   448  			}
   449  
   450  			if repetitionLevels = values.([]int64); uint64(len(repetitionLevels)) > numValues {
   451  				repetitionLevels = repetitionLevels[:numValues]
   452  			}
   453  		} else {
   454  			repetitionLevels = make([]int64, numValues)
   455  		}
   456  
   457  		var definitionLevels []int64
   458  		if maxDefinitionLevel > 0 {
   459  			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
   460  				-1, numValues, getBitWidth(uint64(maxDefinitionLevel)))
   461  			if err != nil {
   462  				return 0, 0, err
   463  			}
   464  			if definitionLevels = values.([]int64); uint64(len(definitionLevels)) > numValues {
   465  				definitionLevels = definitionLevels[:numValues]
   466  			}
   467  		} else {
   468  			definitionLevels = make([]int64, numValues)
   469  		}
   470  
   471  		table := new(table)
   472  		table.Path = page.Path
   473  		name := strings.Join(page.Path, ".")
   474  		table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType()
   475  		table.MaxRepetitionLevel = int32(maxRepetitionLevel)
   476  		table.MaxDefinitionLevel = int32(maxDefinitionLevel)
   477  		table.Values = make([]interface{}, len(definitionLevels))
   478  		table.RepetitionLevels = make([]int32, len(definitionLevels))
   479  		table.DefinitionLevels = make([]int32, len(definitionLevels))
   480  
   481  		numRows := int64(0)
   482  		for i := 0; i < len(definitionLevels); i++ {
   483  			table.RepetitionLevels[i] = int32(repetitionLevels[i])
   484  			table.DefinitionLevels[i] = int32(definitionLevels[i])
   485  			if table.RepetitionLevels[i] == 0 {
   486  				numRows++
   487  			}
   488  		}
   489  		page.DataTable = table
   490  		page.RawData = buf[len(buf)-bytesReader.Len():]
   491  
   492  		return int64(numValues), numRows, nil
   493  	}
   494  
   495  	return 0, 0, fmt.Errorf("Unsupported page type %v", pageType)
   496  }
   497  
   498  func (page *page) getValueFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (err error) {
   499  	pageType := page.Header.GetType()
   500  	switch pageType {
   501  	case parquet.PageType_DICTIONARY_PAGE:
   502  		bytesReader := bytes.NewReader(page.RawData)
   503  		var values interface{}
   504  		if page.Header.DictionaryPageHeader == nil {
   505  			return errors.New("parquet: dictionary not set")
   506  		}
   507  		values, err = readValues(bytesReader, page.DataType,
   508  			uint64(page.Header.DictionaryPageHeader.GetNumValues()), 0)
   509  		if err != nil {
   510  			return err
   511  		}
   512  
   513  		page.DataTable.Values = getTableValues(values, page.DataType)
   514  		return nil
   515  
   516  	case parquet.PageType_DATA_PAGE_V2:
   517  		if page.RawData, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil {
   518  			return err
   519  		}
   520  		fallthrough
   521  	case parquet.PageType_DATA_PAGE:
   522  		encodingType := page.Header.DataPageHeader.GetEncoding()
   523  		bytesReader := bytes.NewReader(page.RawData)
   524  
   525  		var numNulls uint64
   526  		for i := 0; i < len(page.DataTable.DefinitionLevels); i++ {
   527  			if page.DataTable.DefinitionLevels[i] != page.DataTable.MaxDefinitionLevel {
   528  				numNulls++
   529  			}
   530  		}
   531  
   532  		name := strings.Join(page.DataTable.Path, ".")
   533  		var convertedType parquet.ConvertedType = -1
   534  
   535  		if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() {
   536  			convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType()
   537  		}
   538  
   539  		values, _, err := readDataPageValues(bytesReader, encodingType, page.DataType,
   540  			convertedType, uint64(len(page.DataTable.DefinitionLevels))-numNulls,
   541  			uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength()))
   542  		if err != nil {
   543  			return err
   544  		}
   545  
   546  		tableValues := getTableValues(values, page.DataType)
   547  
   548  		j := 0
   549  		for i := 0; i < len(page.DataTable.DefinitionLevels); i++ {
   550  			if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
   551  				page.DataTable.Values[i] = tableValues[j]
   552  				j++
   553  			}
   554  		}
   555  
   556  		page.RawData = []byte{}
   557  		return nil
   558  	}
   559  
   560  	return fmt.Errorf("unsupported page type %v", pageType)
   561  }
   562  
   563  func (page *page) toDataPage(compressType parquet.CompressionCodec) []byte {
   564  	values := []interface{}{}
   565  	for i := range page.DataTable.DefinitionLevels {
   566  		if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
   567  			values = append(values, page.DataTable.Values[i])
   568  		}
   569  	}
   570  	valuesBytes := encodeValues(interfacesToValues(values, page.DataTable.Type), page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth)
   571  
   572  	var defLevelBytes []byte
   573  	if page.DataTable.MaxDefinitionLevel > 0 {
   574  		defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
   575  		for i := range page.DataTable.DefinitionLevels {
   576  			defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
   577  		}
   578  		defLevelBytes = valuesToRLEBitPackedHybridBytes(
   579  			defLevels,
   580  			int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
   581  			parquet.Type_INT64,
   582  		)
   583  	}
   584  
   585  	var repLevelBytes []byte
   586  	if page.DataTable.MaxRepetitionLevel > 0 {
   587  		repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
   588  		for i := range page.DataTable.DefinitionLevels {
   589  			repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
   590  		}
   591  		repLevelBytes = valuesToRLEBitPackedHybridBytes(
   592  			repLevels,
   593  			int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
   594  			parquet.Type_INT64,
   595  		)
   596  	}
   597  
   598  	data := repLevelBytes
   599  	data = append(data, defLevelBytes...)
   600  	data = append(data, valuesBytes...)
   601  
   602  	compressedData, err := compressionCodec(compressType).compress(data)
   603  	if err != nil {
   604  		panic(err)
   605  	}
   606  
   607  	page.Header = parquet.NewPageHeader()
   608  	page.Header.Type = parquet.PageType_DATA_PAGE
   609  	page.Header.CompressedPageSize = int32(len(compressedData))
   610  	page.Header.UncompressedPageSize = int32(len(data))
   611  	page.Header.DataPageHeader = parquet.NewDataPageHeader()
   612  	page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels))
   613  	page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
   614  	page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
   615  	page.Header.DataPageHeader.Encoding = page.DataTable.Encoding
   616  	page.Header.DataPageHeader.Statistics = parquet.NewStatistics()
   617  	if page.MaxVal != nil {
   618  		tmpBuf := valueToBytes(page.MaxVal, page.DataType)
   619  		if page.DataType == parquet.Type_BYTE_ARRAY {
   620  			switch page.DataTable.ConvertedType {
   621  			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
   622  				tmpBuf = tmpBuf[4:]
   623  			}
   624  		}
   625  		page.Header.DataPageHeader.Statistics.Max = tmpBuf
   626  	}
   627  	if page.MinVal != nil {
   628  		tmpBuf := valueToBytes(page.MinVal, page.DataType)
   629  		if page.DataType == parquet.Type_BYTE_ARRAY {
   630  			switch page.DataTable.ConvertedType {
   631  			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
   632  				tmpBuf = tmpBuf[4:]
   633  			}
   634  		}
   635  		page.Header.DataPageHeader.Statistics.Min = tmpBuf
   636  	}
   637  
   638  	ts := thrift.NewTSerializer()
   639  	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
   640  	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
   641  	if err != nil {
   642  		panic(err)
   643  	}
   644  
   645  	page.RawData = append(pageHeaderBytes, compressedData...)
   646  	return page.RawData
   647  }
   648  
   649  func (page *page) toDataPageV2(compressType parquet.CompressionCodec) []byte {
   650  	values := []interface{}{}
   651  	for i := range page.DataTable.DefinitionLevels {
   652  		if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
   653  			values = append(values, page.DataTable.Values[i])
   654  		}
   655  	}
   656  	valuesBytes := encodeValues(values, page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth)
   657  
   658  	var defLevelBytes []byte
   659  	if page.DataTable.MaxDefinitionLevel > 0 {
   660  		defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
   661  		for i := range page.DataTable.DefinitionLevels {
   662  			defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
   663  		}
   664  		defLevelBytes = valuesToRLEBytes(
   665  			defLevels,
   666  			int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
   667  			parquet.Type_INT64,
   668  		)
   669  	}
   670  
   671  	var repLevelBytes []byte
   672  	numRows := int32(0)
   673  	if page.DataTable.MaxRepetitionLevel > 0 {
   674  		repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
   675  		for i := range page.DataTable.DefinitionLevels {
   676  			repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
   677  			if page.DataTable.RepetitionLevels[i] == 0 {
   678  				numRows++
   679  			}
   680  		}
   681  		repLevelBytes = valuesToRLEBytes(
   682  			repLevels,
   683  			int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
   684  			parquet.Type_INT64,
   685  		)
   686  	}
   687  
   688  	compressedData, err := compressionCodec(compressType).compress(valuesBytes)
   689  	if err != nil {
   690  		panic(err)
   691  	}
   692  
   693  	page.Header = parquet.NewPageHeader()
   694  	page.Header.Type = parquet.PageType_DATA_PAGE_V2
   695  	page.Header.CompressedPageSize = int32(len(compressedData) + len(defLevelBytes) + len(repLevelBytes))
   696  	page.Header.UncompressedPageSize = int32(len(valuesBytes) + len(defLevelBytes) + len(repLevelBytes))
   697  	page.Header.DataPageHeaderV2 = parquet.NewDataPageHeaderV2()
   698  	page.Header.DataPageHeaderV2.NumValues = int32(len(page.DataTable.Values))
   699  	page.Header.DataPageHeaderV2.NumNulls = page.Header.DataPageHeaderV2.NumValues - int32(len(values))
   700  	page.Header.DataPageHeaderV2.NumRows = numRows
   701  	page.Header.DataPageHeaderV2.Encoding = page.DataTable.Encoding
   702  	page.Header.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(defLevelBytes))
   703  	page.Header.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(repLevelBytes))
   704  	page.Header.DataPageHeaderV2.IsCompressed = true
   705  
   706  	page.Header.DataPageHeaderV2.Statistics = parquet.NewStatistics()
   707  	if page.MaxVal != nil {
   708  		tmpBuf := valueToBytes(page.MaxVal, page.DataType)
   709  		if page.DataType == parquet.Type_BYTE_ARRAY {
   710  			switch page.DataTable.ConvertedType {
   711  			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
   712  				tmpBuf = tmpBuf[4:]
   713  			}
   714  		}
   715  		page.Header.DataPageHeaderV2.Statistics.Max = tmpBuf
   716  	}
   717  	if page.MinVal != nil {
   718  		tmpBuf := valueToBytes(page.MinVal, page.DataType)
   719  		if page.DataType == parquet.Type_BYTE_ARRAY {
   720  			switch page.DataTable.ConvertedType {
   721  			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
   722  				tmpBuf = tmpBuf[4:]
   723  			}
   724  		}
   725  		page.Header.DataPageHeaderV2.Statistics.Min = tmpBuf
   726  	}
   727  
   728  	ts := thrift.NewTSerializer()
   729  	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
   730  	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
   731  	if err != nil {
   732  		panic(err)
   733  	}
   734  
   735  	page.RawData = append(pageHeaderBytes, repLevelBytes...)
   736  	page.RawData = append(page.RawData, defLevelBytes...)
   737  	page.RawData = append(page.RawData, compressedData...)
   738  
   739  	return page.RawData
   740  }
   741  
   742  func (page *page) toDictPage(compressType parquet.CompressionCodec, dataType parquet.Type) []byte {
   743  	valuesBytes := valuesToBytes(page.DataTable.Values, dataType)
   744  	compressedData, err := compressionCodec(compressType).compress(valuesBytes)
   745  	if err != nil {
   746  		panic(err)
   747  	}
   748  
   749  	page.Header = parquet.NewPageHeader()
   750  	page.Header.Type = parquet.PageType_DICTIONARY_PAGE
   751  	page.Header.CompressedPageSize = int32(len(compressedData))
   752  	page.Header.UncompressedPageSize = int32(len(valuesBytes))
   753  	page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
   754  	page.Header.DictionaryPageHeader.NumValues = int32(len(page.DataTable.Values))
   755  	page.Header.DictionaryPageHeader.Encoding = parquet.Encoding_PLAIN
   756  
   757  	ts := thrift.NewTSerializer()
   758  	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
   759  	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
   760  	if err != nil {
   761  		panic(err)
   762  	}
   763  
   764  	page.RawData = append(pageHeaderBytes, compressedData...)
   765  	return page.RawData
   766  }
   767  
   768  func (page *page) toDictDataPage(compressType parquet.CompressionCodec, bitWidth int32) []byte {
   769  	valuesBytes := append([]byte{byte(bitWidth)}, valuesToRLEBytes(page.DataTable.Values, bitWidth, parquet.Type_INT32)...)
   770  
   771  	var defLevelBytes []byte
   772  	if page.DataTable.MaxDefinitionLevel > 0 {
   773  		defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
   774  		for i := range page.DataTable.DefinitionLevels {
   775  			defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
   776  		}
   777  		defLevelBytes = valuesToRLEBitPackedHybridBytes(
   778  			defLevels,
   779  			int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
   780  			parquet.Type_INT64,
   781  		)
   782  	}
   783  
   784  	var repLevelBytes []byte
   785  	if page.DataTable.MaxRepetitionLevel > 0 {
   786  		repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
   787  		for i := range page.DataTable.DefinitionLevels {
   788  			repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
   789  		}
   790  		repLevelBytes = valuesToRLEBitPackedHybridBytes(
   791  			repLevels,
   792  			int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
   793  			parquet.Type_INT64,
   794  		)
   795  	}
   796  
   797  	data := append(repLevelBytes, defLevelBytes...)
   798  	data = append(data, valuesBytes...)
   799  
   800  	compressedData, err := compressionCodec(compressType).compress(data)
   801  	if err != nil {
   802  		panic(err)
   803  	}
   804  
   805  	page.Header = parquet.NewPageHeader()
   806  	page.Header.Type = parquet.PageType_DATA_PAGE
   807  	page.Header.CompressedPageSize = int32(len(compressedData))
   808  	page.Header.UncompressedPageSize = int32(len(data))
   809  	page.Header.DataPageHeader = parquet.NewDataPageHeader()
   810  	page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels))
   811  	page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
   812  	page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
   813  	page.Header.DataPageHeader.Encoding = parquet.Encoding_PLAIN_DICTIONARY
   814  
   815  	ts := thrift.NewTSerializer()
   816  	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
   817  	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
   818  	if err != nil {
   819  		panic(err)
   820  	}
   821  
   822  	page.RawData = append(pageHeaderBytes, compressedData...)
   823  	return page.RawData
   824  }