storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/data/column.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2019 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package data
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"strings"
    24  
    25  	"git.apache.org/thrift.git/lib/go/thrift"
    26  	"github.com/tidwall/gjson"
    27  	"github.com/tidwall/sjson"
    28  
    29  	"storj.io/minio/pkg/s3select/internal/parquet-go/common"
    30  	"storj.io/minio/pkg/s3select/internal/parquet-go/encoding"
    31  	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
    32  	"storj.io/minio/pkg/s3select/internal/parquet-go/schema"
    33  )
    34  
    35  func getDefaultEncoding(parquetType parquet.Type) parquet.Encoding {
    36  	switch parquetType {
    37  	case parquet.Type_BOOLEAN:
    38  		return parquet.Encoding_PLAIN
    39  	case parquet.Type_INT32, parquet.Type_INT64, parquet.Type_FLOAT, parquet.Type_DOUBLE:
    40  		return parquet.Encoding_RLE_DICTIONARY
    41  	case parquet.Type_BYTE_ARRAY:
    42  		return parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY
    43  	}
    44  
    45  	return parquet.Encoding_PLAIN
    46  }
    47  
    48  func getFirstValueElement(tree *schema.Tree) (valueElement *schema.Element) {
    49  	tree.Range(func(name string, element *schema.Element) bool {
    50  		if element.Children == nil {
    51  			valueElement = element
    52  		} else {
    53  			valueElement = getFirstValueElement(element.Children)
    54  		}
    55  
    56  		return false
    57  	})
    58  
    59  	return valueElement
    60  }
    61  
// populate walks the schema tree against the JSON input and appends one
// value (or null) per leaf column into columnDataMap, keyed by the
// element's PathInSchema. firstValueRL is the repetition level assigned
// to the first value emitted at this nesting depth (0 for a fresh row;
// deeper recursive calls pass the surrounding element's level). Returns
// the updated map and the first error encountered.
func populate(columnDataMap map[string]*Column, input *jsonValue, tree *schema.Tree, firstValueRL int64) (map[string]*Column, error) {
	var err error

	// pos counts elements visited at this level; only the very first one
	// takes firstValueRL as its repetition level.
	pos := 0
	handleElement := func(name string, element *schema.Element) bool {
		pos++

		dataPath := element.PathInTree

		// REPEATED fields are expected to have been modeled as LIST/MAP
		// groups by the schema; reaching one here is a programming error.
		if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED {
			panic(fmt.Errorf("%v: repetition type must be REQUIRED or OPTIONAL type", dataPath))
		}

		inputValue := input.Get(name)
		if *element.RepetitionType == parquet.FieldRepetitionType_REQUIRED && inputValue.IsNull() {
			err = fmt.Errorf("%v: nil value for required field", dataPath)
			return false
		}

		// add appends value (with its levels) to the column for element,
		// creating the column on first use.
		add := func(element *schema.Element, value interface{}, DL, RL int64) {
			columnData := columnDataMap[element.PathInSchema]
			if columnData == nil {
				columnData = NewColumn(*element.Type)
			}
			columnData.add(value, DL, RL)
			columnDataMap[element.PathInSchema] = columnData
		}

		// Handle primitive type element.
		if element.Type != nil {
			var value interface{}
			if value, err = inputValue.GetValue(*element.Type, element.ConvertedType); err != nil {
				return false
			}

			// A missing OPTIONAL value is recorded one definition level
			// below the element's maximum.
			DL := element.MaxDefinitionLevel
			if value == nil && DL > 0 {
				DL--
			}

			RL := element.MaxRepetitionLevel
			if pos == 1 {
				RL = firstValueRL
			}

			add(element, value, DL, RL)
			return true
		}

		// addNull records a null against the group's first leaf column so
		// the column still receives an entry for this absent branch, with
		// both levels reduced by one to mark the missing group.
		addNull := func() {
			valueElement := getFirstValueElement(element.Children)

			DL := element.MaxDefinitionLevel
			if DL > 0 {
				DL--
			}

			RL := element.MaxRepetitionLevel
			if RL > 0 {
				RL--
			}

			add(valueElement, nil, DL, RL)
		}

		// Handle group type element.
		if element.ConvertedType == nil {
			if inputValue.IsNull() {
				addNull()
				return true
			}

			columnDataMap, err = populate(columnDataMap, inputValue, element.Children, firstValueRL)
			return (err == nil)
		}

		// Handle list type element (schema shape: <name>.list.element).
		if *element.ConvertedType == parquet.ConvertedType_LIST {
			if inputValue.IsNull() {
				addNull()
				return true
			}

			var results []gjson.Result
			if results, err = inputValue.GetArray(); err != nil {
				return false
			}

			listElement, _ := element.Children.Get("list")
			valueElement, _ := listElement.Children.Get("element")
			for i := range results {
				// Only the first list entry starts at firstValueRL; the
				// rest repeat at the element's own level.
				rl := valueElement.MaxRepetitionLevel
				if i == 0 {
					rl = firstValueRL
				}

				// Re-wrap the array entry as {"element": ...} so the
				// recursive call can look it up by name.
				var jsonData []byte
				if jsonData, err = sjson.SetBytes([]byte{}, "element", results[i].Value()); err != nil {
					return false
				}

				var jv *jsonValue
				if jv, err = bytesToJSONValue(jsonData); err != nil {
					return false
				}

				if columnDataMap, err = populate(columnDataMap, jv, listElement.Children, rl); err != nil {
					return false
				}
			}
			return true
		}

		// Handle map type element (schema shape: <name>.key_value.{key,value}).
		if *element.ConvertedType == parquet.ConvertedType_MAP {
			if inputValue.IsNull() {
				addNull()
				return true
			}

			keyValueElement, _ := element.Children.Get("key_value")
			// rerr carries errors out of the Range callback; err may also
			// be set by Range itself. Both abort the walk.
			var rerr error
			err = inputValue.Range(func(key, value gjson.Result) bool {
				if !key.Exists() || key.Type == gjson.Null {
					rerr = fmt.Errorf("%v.key_value.key: not found or null", dataPath)
					return false
				}

				// Re-wrap the pair as {"key": ..., "value": ...} for the
				// recursive call.
				var jsonData []byte
				if jsonData, rerr = sjson.SetBytes([]byte{}, "key", key.Value()); rerr != nil {
					return false
				}

				if jsonData, rerr = sjson.SetBytes(jsonData, "value", value.Value()); rerr != nil {
					return false
				}

				var jv *jsonValue
				if jv, rerr = bytesToJSONValue(jsonData); rerr != nil {
					return false
				}

				if columnDataMap, rerr = populate(columnDataMap, jv, keyValueElement.Children, firstValueRL); rerr != nil {
					return false
				}

				return true
			})

			if err != nil {
				return false
			}

			err = rerr
			return (err == nil)
		}

		err = fmt.Errorf("%v: unsupported converted type %v in %v field type", dataPath, *element.ConvertedType, *element.RepetitionType)
		return false
	}

	tree.Range(handleElement)
	return columnDataMap, err
}
   225  
// Column - denotes values of a column.
type Column struct {
	parquetType      parquet.Type  // value type.
	values           []interface{} // must be a slice of parquet typed values.
	definitionLevels []int64       // exactly same length of values.
	repetitionLevels []int64       // exactly same length of values.
	rowCount         int32         // number of rows (values with repetition level 0).
	maxBitWidth      int32         // widest bit width seen across values; drives dictionary index sizing.
	minValue         interface{}   // running minimum; nil until a non-nil value is added.
	maxValue         interface{}   // running maximum; nil until a non-nil value is added.
}
   237  
   238  func (column *Column) updateMinMaxValue(value interface{}) {
   239  	if column.minValue == nil && column.maxValue == nil {
   240  		column.minValue = value
   241  		column.maxValue = value
   242  		return
   243  	}
   244  
   245  	switch column.parquetType {
   246  	case parquet.Type_BOOLEAN:
   247  		if column.minValue.(bool) && !value.(bool) {
   248  			column.minValue = value
   249  		}
   250  
   251  		if !column.maxValue.(bool) && value.(bool) {
   252  			column.maxValue = value
   253  		}
   254  
   255  	case parquet.Type_INT32:
   256  		if column.minValue.(int32) > value.(int32) {
   257  			column.minValue = value
   258  		}
   259  
   260  		if column.maxValue.(int32) < value.(int32) {
   261  			column.maxValue = value
   262  		}
   263  
   264  	case parquet.Type_INT64:
   265  		if column.minValue.(int64) > value.(int64) {
   266  			column.minValue = value
   267  		}
   268  
   269  		if column.maxValue.(int64) < value.(int64) {
   270  			column.maxValue = value
   271  		}
   272  
   273  	case parquet.Type_FLOAT:
   274  		if column.minValue.(float32) > value.(float32) {
   275  			column.minValue = value
   276  		}
   277  
   278  		if column.maxValue.(float32) < value.(float32) {
   279  			column.maxValue = value
   280  		}
   281  
   282  	case parquet.Type_DOUBLE:
   283  		if column.minValue.(float64) > value.(float64) {
   284  			column.minValue = value
   285  		}
   286  
   287  		if column.maxValue.(float64) < value.(float64) {
   288  			column.maxValue = value
   289  		}
   290  
   291  	case parquet.Type_BYTE_ARRAY:
   292  		if bytes.Compare(column.minValue.([]byte), value.([]byte)) > 0 {
   293  			column.minValue = value
   294  		}
   295  
   296  		if bytes.Compare(column.minValue.([]byte), value.([]byte)) < 0 {
   297  			column.maxValue = value
   298  		}
   299  	}
   300  }
   301  
   302  func (column *Column) updateStats(value interface{}, DL, RL int64) {
   303  	if RL == 0 {
   304  		column.rowCount++
   305  	}
   306  
   307  	if value == nil {
   308  		return
   309  	}
   310  
   311  	var bitWidth int32
   312  	switch column.parquetType {
   313  	case parquet.Type_BOOLEAN:
   314  		bitWidth = 1
   315  	case parquet.Type_INT32:
   316  		bitWidth = common.BitWidth(uint64(value.(int32)))
   317  	case parquet.Type_INT64:
   318  		bitWidth = common.BitWidth(uint64(value.(int64)))
   319  	case parquet.Type_FLOAT:
   320  		bitWidth = 32
   321  	case parquet.Type_DOUBLE:
   322  		bitWidth = 64
   323  	case parquet.Type_BYTE_ARRAY:
   324  		bitWidth = int32(len(value.([]byte)))
   325  	}
   326  	if column.maxBitWidth < bitWidth {
   327  		column.maxBitWidth = bitWidth
   328  	}
   329  
   330  	column.updateMinMaxValue(value)
   331  }
   332  
   333  func (column *Column) add(value interface{}, DL, RL int64) {
   334  	column.values = append(column.values, value)
   335  	column.definitionLevels = append(column.definitionLevels, DL)
   336  	column.repetitionLevels = append(column.repetitionLevels, RL)
   337  	column.updateStats(value, DL, RL)
   338  }
   339  
// AddNull - adds nil value with the given definition and repetition levels.
func (column *Column) AddNull(DL, RL int64) {
	column.add(nil, DL, RL)
}
   344  
   345  // AddBoolean - adds boolean value.
   346  func (column *Column) AddBoolean(value bool, DL, RL int64) {
   347  	if column.parquetType != parquet.Type_BOOLEAN {
   348  		panic(fmt.Errorf("expected %v value", column.parquetType))
   349  	}
   350  
   351  	column.add(value, DL, RL)
   352  }
   353  
   354  // AddInt32 - adds int32 value.
   355  func (column *Column) AddInt32(value int32, DL, RL int64) {
   356  	if column.parquetType != parquet.Type_INT32 {
   357  		panic(fmt.Errorf("expected %v value", column.parquetType))
   358  	}
   359  
   360  	column.add(value, DL, RL)
   361  }
   362  
   363  // AddInt64 - adds int64 value.
   364  func (column *Column) AddInt64(value int64, DL, RL int64) {
   365  	if column.parquetType != parquet.Type_INT64 {
   366  		panic(fmt.Errorf("expected %v value", column.parquetType))
   367  	}
   368  
   369  	column.add(value, DL, RL)
   370  }
   371  
   372  // AddFloat - adds float32 value.
   373  func (column *Column) AddFloat(value float32, DL, RL int64) {
   374  	if column.parquetType != parquet.Type_FLOAT {
   375  		panic(fmt.Errorf("expected %v value", column.parquetType))
   376  	}
   377  
   378  	column.add(value, DL, RL)
   379  }
   380  
   381  // AddDouble - adds float64 value.
   382  func (column *Column) AddDouble(value float64, DL, RL int64) {
   383  	if column.parquetType != parquet.Type_DOUBLE {
   384  		panic(fmt.Errorf("expected %v value", column.parquetType))
   385  	}
   386  
   387  	column.add(value, DL, RL)
   388  }
   389  
   390  // AddByteArray - adds byte array value.
   391  func (column *Column) AddByteArray(value []byte, DL, RL int64) {
   392  	if column.parquetType != parquet.Type_BYTE_ARRAY {
   393  		panic(fmt.Errorf("expected %v value", column.parquetType))
   394  	}
   395  
   396  	column.add(value, DL, RL)
   397  }
   398  
   399  // Merge - merges columns.
   400  func (column *Column) Merge(column2 *Column) {
   401  	if column.parquetType != column2.parquetType {
   402  		panic(fmt.Errorf("merge differs in parquet type"))
   403  	}
   404  
   405  	column.values = append(column.values, column2.values...)
   406  	column.definitionLevels = append(column.definitionLevels, column2.definitionLevels...)
   407  	column.repetitionLevels = append(column.repetitionLevels, column2.repetitionLevels...)
   408  
   409  	column.rowCount += column2.rowCount
   410  	if column.maxBitWidth < column2.maxBitWidth {
   411  		column.maxBitWidth = column2.maxBitWidth
   412  	}
   413  
   414  	column.updateMinMaxValue(column2.minValue)
   415  	column.updateMinMaxValue(column2.maxValue)
   416  }
   417  
   418  func (column *Column) String() string {
   419  	var strs []string
   420  	strs = append(strs, fmt.Sprintf("parquetType: %v", column.parquetType))
   421  	strs = append(strs, fmt.Sprintf("values: %v", column.values))
   422  	strs = append(strs, fmt.Sprintf("definitionLevels: %v", column.definitionLevels))
   423  	strs = append(strs, fmt.Sprintf("repetitionLevels: %v", column.repetitionLevels))
   424  	strs = append(strs, fmt.Sprintf("rowCount: %v", column.rowCount))
   425  	strs = append(strs, fmt.Sprintf("maxBitWidth: %v", column.maxBitWidth))
   426  	strs = append(strs, fmt.Sprintf("minValue: %v", column.minValue))
   427  	strs = append(strs, fmt.Sprintf("maxValue: %v", column.maxValue))
   428  	return "{" + strings.Join(strs, ", ") + "}"
   429  }
   430  
// encodeValue plain-encodes a single statistics value (min or max) for
// storage in page/column metadata. Returns nil for a nil value.
func (column *Column) encodeValue(value interface{}, element *schema.Element) []byte {
	if value == nil {
		return nil
	}

	valueData := encoding.PlainEncode(common.ToSliceValue([]interface{}{value}, column.parquetType), column.parquetType)
	if column.parquetType == parquet.Type_BYTE_ARRAY && element.ConvertedType != nil {
		switch *element.ConvertedType {
		case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
			// Drop the first 4 bytes — presumably the plain-encoding
			// length prefix — so UTF8/DECIMAL statistics hold the raw
			// bytes only. NOTE(review): confirm PlainEncode emits a
			// 4-byte length prefix for BYTE_ARRAY.
			valueData = valueData[4:]
		}
	}

	return valueData
}
   446  
   447  func (column *Column) toDataPageV2(element *schema.Element, parquetEncoding parquet.Encoding) *ColumnChunk {
   448  	var definedValues []interface{}
   449  	for _, value := range column.values {
   450  		if value != nil {
   451  			definedValues = append(definedValues, value)
   452  		}
   453  	}
   454  
   455  	var encodedData []byte
   456  	switch parquetEncoding {
   457  	case parquet.Encoding_PLAIN:
   458  		encodedData = encoding.PlainEncode(common.ToSliceValue(definedValues, column.parquetType), column.parquetType)
   459  
   460  	case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
   461  		var bytesSlices [][]byte
   462  		for _, value := range column.values {
   463  			bytesSlices = append(bytesSlices, value.([]byte))
   464  		}
   465  		encodedData = encoding.DeltaLengthByteArrayEncode(bytesSlices)
   466  	}
   467  
   468  	compressionType := parquet.CompressionCodec_SNAPPY
   469  	if element.CompressionType != nil {
   470  		compressionType = *element.CompressionType
   471  	}
   472  
   473  	compressedData, err := common.Compress(compressionType, encodedData)
   474  	if err != nil {
   475  		panic(err)
   476  	}
   477  
   478  	DLData := encoding.RLEBitPackedHybridEncode(
   479  		column.definitionLevels,
   480  		common.BitWidth(uint64(element.MaxDefinitionLevel)),
   481  		parquet.Type_INT64,
   482  	)
   483  
   484  	RLData := encoding.RLEBitPackedHybridEncode(
   485  		column.repetitionLevels,
   486  		common.BitWidth(uint64(element.MaxRepetitionLevel)),
   487  		parquet.Type_INT64,
   488  	)
   489  
   490  	pageHeader := parquet.NewPageHeader()
   491  	pageHeader.Type = parquet.PageType_DATA_PAGE_V2
   492  	pageHeader.CompressedPageSize = int32(len(compressedData) + len(DLData) + len(RLData))
   493  	pageHeader.UncompressedPageSize = int32(len(encodedData) + len(DLData) + len(RLData))
   494  	pageHeader.DataPageHeaderV2 = parquet.NewDataPageHeaderV2()
   495  	pageHeader.DataPageHeaderV2.NumValues = int32(len(column.values))
   496  	pageHeader.DataPageHeaderV2.NumNulls = int32(len(column.values) - len(definedValues))
   497  	pageHeader.DataPageHeaderV2.NumRows = column.rowCount
   498  	pageHeader.DataPageHeaderV2.Encoding = parquetEncoding
   499  	pageHeader.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(DLData))
   500  	pageHeader.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(RLData))
   501  	pageHeader.DataPageHeaderV2.IsCompressed = true
   502  	pageHeader.DataPageHeaderV2.Statistics = parquet.NewStatistics()
   503  	pageHeader.DataPageHeaderV2.Statistics.Min = column.encodeValue(column.minValue, element)
   504  	pageHeader.DataPageHeaderV2.Statistics.Max = column.encodeValue(column.maxValue, element)
   505  
   506  	ts := thrift.NewTSerializer()
   507  	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
   508  	rawData, err := ts.Write(context.TODO(), pageHeader)
   509  	if err != nil {
   510  		panic(err)
   511  	}
   512  	rawData = append(rawData, RLData...)
   513  	rawData = append(rawData, DLData...)
   514  	rawData = append(rawData, compressedData...)
   515  
   516  	metadata := parquet.NewColumnMetaData()
   517  	metadata.Type = column.parquetType
   518  	metadata.Encodings = []parquet.Encoding{
   519  		parquet.Encoding_PLAIN,
   520  		parquet.Encoding_RLE,
   521  		parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY,
   522  	}
   523  	metadata.Codec = compressionType
   524  	metadata.NumValues = int64(pageHeader.DataPageHeaderV2.NumValues)
   525  	metadata.TotalCompressedSize = int64(len(rawData))
   526  	metadata.TotalUncompressedSize = int64(pageHeader.UncompressedPageSize) + int64(len(rawData)) - int64(pageHeader.CompressedPageSize)
   527  	metadata.PathInSchema = strings.Split(element.PathInSchema, ".")
   528  	metadata.Statistics = parquet.NewStatistics()
   529  	metadata.Statistics.Min = pageHeader.DataPageHeaderV2.Statistics.Min
   530  	metadata.Statistics.Max = pageHeader.DataPageHeaderV2.Statistics.Max
   531  
   532  	chunk := new(ColumnChunk)
   533  	chunk.ColumnChunk.MetaData = metadata
   534  	chunk.dataPageLen = int64(len(rawData))
   535  	chunk.dataLen = int64(len(rawData))
   536  	chunk.data = rawData
   537  
   538  	return chunk
   539  }
   540  
// toRLEDictPage encodes the column as a DICTIONARY_PAGE followed by an
// RLE_DICTIONARY DATA_PAGE (V1 layout), returning both concatenated in
// a single column chunk. The element's compression codec is honored,
// defaulting to SNAPPY. Panics on compression or thrift serialization
// failure.
func (column *Column) toRLEDictPage(element *schema.Element) *ColumnChunk {
	// Split values into a dictionary payload and a payload of indices
	// into that dictionary.
	dictPageData, dataPageData, dictValueCount, indexBitWidth := encoding.RLEDictEncode(column.values, column.parquetType, column.maxBitWidth)

	compressionType := parquet.CompressionCodec_SNAPPY
	if element.CompressionType != nil {
		compressionType = *element.CompressionType
	}

	compressedData, err := common.Compress(compressionType, dictPageData)
	if err != nil {
		panic(err)
	}

	dictPageHeader := parquet.NewPageHeader()
	dictPageHeader.Type = parquet.PageType_DICTIONARY_PAGE
	dictPageHeader.CompressedPageSize = int32(len(compressedData))
	dictPageHeader.UncompressedPageSize = int32(len(dictPageData))
	dictPageHeader.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
	dictPageHeader.DictionaryPageHeader.NumValues = dictValueCount
	dictPageHeader.DictionaryPageHeader.Encoding = parquet.Encoding_PLAIN

	// Serialize the dictionary page: thrift header + compressed payload.
	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	dictPageRawData, err := ts.Write(context.TODO(), dictPageHeader)
	if err != nil {
		panic(err)
	}
	dictPageRawData = append(dictPageRawData, compressedData...)

	// V1 data page body: RL data, DL data, index bit width byte, then
	// the dictionary indices — all compressed together below.
	RLData := encoding.RLEBitPackedHybridEncode(
		column.repetitionLevels,
		common.BitWidth(uint64(element.MaxRepetitionLevel)),
		parquet.Type_INT64,
	)
	encodedData := RLData

	DLData := encoding.RLEBitPackedHybridEncode(
		column.definitionLevels,
		common.BitWidth(uint64(element.MaxDefinitionLevel)),
		parquet.Type_INT64,
	)
	encodedData = append(encodedData, DLData...)

	encodedData = append(encodedData, indexBitWidth)
	encodedData = append(encodedData, dataPageData...)

	compressedData, err = common.Compress(compressionType, encodedData)
	if err != nil {
		panic(err)
	}

	dataPageHeader := parquet.NewPageHeader()
	dataPageHeader.Type = parquet.PageType_DATA_PAGE
	dataPageHeader.CompressedPageSize = int32(len(compressedData))
	dataPageHeader.UncompressedPageSize = int32(len(encodedData))
	dataPageHeader.DataPageHeader = parquet.NewDataPageHeader()
	dataPageHeader.DataPageHeader.NumValues = int32(len(column.values))
	dataPageHeader.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
	dataPageHeader.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
	dataPageHeader.DataPageHeader.Encoding = parquet.Encoding_RLE_DICTIONARY

	// Serialize the data page: thrift header + compressed body.
	ts = thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	dataPageRawData, err := ts.Write(context.TODO(), dataPageHeader)
	if err != nil {
		panic(err)
	}
	dataPageRawData = append(dataPageRawData, compressedData...)

	metadata := parquet.NewColumnMetaData()
	metadata.Type = column.parquetType
	metadata.Encodings = []parquet.Encoding{
		parquet.Encoding_PLAIN,
		parquet.Encoding_RLE,
		parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY,
		parquet.Encoding_RLE_DICTIONARY,
	}
	metadata.Codec = compressionType
	metadata.NumValues = int64(dataPageHeader.DataPageHeader.NumValues)
	metadata.TotalCompressedSize = int64(len(dictPageRawData)) + int64(len(dataPageRawData))
	// Uncompressed total = per-page raw size with each compressed
	// payload swapped for its uncompressed size.
	uncompressedSize := int64(dictPageHeader.UncompressedPageSize) + int64(len(dictPageData)) - int64(dictPageHeader.CompressedPageSize)
	uncompressedSize += int64(dataPageHeader.UncompressedPageSize) + int64(len(dataPageData)) - int64(dataPageHeader.CompressedPageSize)
	metadata.TotalUncompressedSize = uncompressedSize
	metadata.PathInSchema = strings.Split(element.PathInSchema, ".")
	metadata.Statistics = parquet.NewStatistics()
	metadata.Statistics.Min = column.encodeValue(column.minValue, element)
	metadata.Statistics.Max = column.encodeValue(column.maxValue, element)

	chunk := new(ColumnChunk)
	chunk.ColumnChunk.MetaData = metadata
	chunk.isDictPage = true
	chunk.dictPageLen = int64(len(dictPageRawData))
	chunk.dataPageLen = int64(len(dataPageRawData))
	chunk.dataLen = chunk.dictPageLen + chunk.dataPageLen
	chunk.data = append(dictPageRawData, dataPageRawData...)

	return chunk
}
   639  
   640  // Encode an element.
   641  func (column *Column) Encode(element *schema.Element) *ColumnChunk {
   642  	parquetEncoding := getDefaultEncoding(column.parquetType)
   643  	if element.Encoding != nil {
   644  		parquetEncoding = *element.Encoding
   645  	}
   646  
   647  	switch parquetEncoding {
   648  	case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
   649  		return column.toDataPageV2(element, parquetEncoding)
   650  	}
   651  
   652  	return column.toRLEDictPage(element)
   653  }
   654  
   655  // NewColumn - creates new column data
   656  func NewColumn(parquetType parquet.Type) *Column {
   657  	switch parquetType {
   658  	case parquet.Type_BOOLEAN, parquet.Type_INT32, parquet.Type_INT64, parquet.Type_FLOAT, parquet.Type_DOUBLE, parquet.Type_BYTE_ARRAY:
   659  	default:
   660  		panic(fmt.Errorf("unsupported parquet type %v", parquetType))
   661  	}
   662  
   663  	return &Column{
   664  		parquetType: parquetType,
   665  	}
   666  }
   667  
   668  // UnmarshalJSON - decodes JSON data into map of Column.
   669  func UnmarshalJSON(data []byte, tree *schema.Tree) (map[string]*Column, error) {
   670  	if !tree.ReadOnly() {
   671  		return nil, fmt.Errorf("tree must be read only")
   672  	}
   673  
   674  	inputValue, err := bytesToJSONValue(data)
   675  	if err != nil {
   676  		return nil, err
   677  	}
   678  
   679  	columnDataMap := make(map[string]*Column)
   680  	return populate(columnDataMap, inputValue, tree, 0)
   681  }