github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/column.go (about)

     1  package parquet
     2  
     3  import (
     4  	"encoding/binary"
     5  	"fmt"
     6  	"io"
     7  	"reflect"
     8  
     9  	"github.com/vc42/parquet-go/compress"
    10  	"github.com/vc42/parquet-go/deprecated"
    11  	"github.com/vc42/parquet-go/encoding"
    12  	"github.com/vc42/parquet-go/format"
    13  )
    14  
// Column represents a column in a parquet file.
//
// Methods of Column values are safe to call concurrently from multiple
// goroutines.
//
// Column instances satisfy the Node interface.
type Column struct {
	// Description of the column: its parquet type, the file it was loaded
	// from, the schema element it was built from, its column order (left nil
	// when the file metadata has no entry for it), and its path.
	typ         Type
	file        *File
	schema      *format.SchemaElement
	order       *format.ColumnOrder
	path        columnPath
	// Child columns; the slice is empty for leaf columns.
	columns     []*Column
	// Populated for leaf columns only: the column chunks of each row group,
	// and the column and offset indexes when the file carries them.
	chunks      []*format.ColumnChunk
	columnIndex []*format.ColumnIndex
	offsetIndex []*format.OffsetIndex
	// Encoding and compression codec picked from the first column chunk.
	encoding    encoding.Encoding
	compression compress.Codec

	// Values assigned by setLevels; index is -1 for non-leaf columns.
	depth              int8
	maxRepetitionLevel byte
	maxDefinitionLevel byte
	index              int16
}
    39  
    40  // Type returns the type of the column.
    41  //
    42  // The returned value is unspecified if c is not a leaf column.
    43  func (c *Column) Type() Type { return c.typ }
    44  
    45  // Optional returns true if the column is optional.
    46  func (c *Column) Optional() bool { return schemaRepetitionTypeOf(c.schema) == format.Optional }
    47  
    48  // Repeated returns true if the column may repeat.
    49  func (c *Column) Repeated() bool { return schemaRepetitionTypeOf(c.schema) == format.Repeated }
    50  
    51  // Required returns true if the column is required.
    52  func (c *Column) Required() bool { return schemaRepetitionTypeOf(c.schema) == format.Required }
    53  
    54  // Leaf returns true if c is a leaf column.
    55  func (c *Column) Leaf() bool { return c.index >= 0 }
    56  
    57  // Fields returns the list of fields on the column.
    58  func (c *Column) Fields() []Field {
    59  	fields := make([]Field, len(c.columns))
    60  	for i, column := range c.columns {
    61  		fields[i] = column
    62  	}
    63  	return fields
    64  }
    65  
    66  // Encoding returns the encodings used by this column.
    67  func (c *Column) Encoding() encoding.Encoding { return c.encoding }
    68  
    69  // Compression returns the compression codecs used by this column.
    70  func (c *Column) Compression() compress.Codec { return c.compression }
    71  
    72  // Path of the column in the parquet schema.
    73  func (c *Column) Path() []string { return c.path }
    74  
    75  // Name returns the column name.
    76  func (c *Column) Name() string { return c.schema.Name }
    77  
    78  // Columns returns the list of child columns.
    79  //
    80  // The method returns the same slice across multiple calls, the program must
    81  // treat it as a read-only value.
    82  func (c *Column) Columns() []*Column { return c.columns }
    83  
    84  // Column returns the child column matching the given name.
    85  func (c *Column) Column(name string) *Column {
    86  	for _, child := range c.columns {
    87  		if child.Name() == name {
    88  			return child
    89  		}
    90  	}
    91  	return nil
    92  }
    93  
    94  // Pages returns a reader exposing all pages in this column, across row groups.
    95  func (c *Column) Pages() Pages {
    96  	if c.index < 0 {
    97  		return emptyPages{}
    98  	}
    99  	r := &columnPages{
   100  		pages: make([]filePages, len(c.file.rowGroups)),
   101  	}
   102  	for i := range r.pages {
   103  		r.pages[i].init(c.file.rowGroups[i].(*fileRowGroup).columns[c.index].(*fileColumnChunk))
   104  	}
   105  	return r
   106  }
   107  
// columnPages is the page reader returned by Column.Pages for leaf columns.
// It reads the page readers held in pages one after the other.
type columnPages struct {
	// One page reader per row group of the file.
	pages []filePages
	// Position in pages of the reader currently being consumed.
	index int
}
   112  
   113  func (c *columnPages) ReadPage() (Page, error) {
   114  	for {
   115  		if c.index >= len(c.pages) {
   116  			return nil, io.EOF
   117  		}
   118  		p, err := c.pages[c.index].ReadPage()
   119  		if err == nil || err != io.EOF {
   120  			return p, err
   121  		}
   122  		c.index++
   123  	}
   124  }
   125  
   126  func (c *columnPages) SeekToRow(rowIndex int64) error {
   127  	c.index = 0
   128  
   129  	for c.index < len(c.pages) && c.pages[c.index].chunk.rowGroup.NumRows >= rowIndex {
   130  		rowIndex -= c.pages[c.index].chunk.rowGroup.NumRows
   131  		c.index++
   132  	}
   133  
   134  	if c.index < len(c.pages) {
   135  		if err := c.pages[c.index].SeekToRow(rowIndex); err != nil {
   136  			return err
   137  		}
   138  		for i := range c.pages[c.index:] {
   139  			p := &c.pages[c.index+i]
   140  			if err := p.SeekToRow(0); err != nil {
   141  				return err
   142  			}
   143  		}
   144  	}
   145  	return nil
   146  }
   147  
   148  func (c *columnPages) Close() error {
   149  	var lastErr error
   150  
   151  	for i := range c.pages {
   152  		if err := c.pages[i].Close(); err != nil {
   153  			lastErr = err
   154  		}
   155  	}
   156  
   157  	c.pages = nil
   158  	c.index = 0
   159  	return lastErr
   160  }
   161  
   162  // Depth returns the position of the column relative to the root.
   163  func (c *Column) Depth() int { return int(c.depth) }
   164  
   165  // MaxRepetitionLevel returns the maximum value of repetition levels on this
   166  // column.
   167  func (c *Column) MaxRepetitionLevel() int { return int(c.maxRepetitionLevel) }
   168  
   169  // MaxDefinitionLevel returns the maximum value of definition levels on this
   170  // column.
   171  func (c *Column) MaxDefinitionLevel() int { return int(c.maxDefinitionLevel) }
   172  
   173  // Index returns the position of the column in a row. Only leaf columns have a
   174  // column index, the method returns -1 when called on non-leaf columns.
   175  func (c *Column) Index() int { return int(c.index) }
   176  
   177  // GoType returns the Go type that best represents the parquet column.
   178  func (c *Column) GoType() reflect.Type { return goTypeOf(c) }
   179  
   180  // Value returns the sub-value in base for the child column at the given
   181  // index.
   182  func (c *Column) Value(base reflect.Value) reflect.Value {
   183  	return base.MapIndex(reflect.ValueOf(&c.schema.Name).Elem())
   184  }
   185  
   186  // String returns a human-readable string representation of the column.
   187  func (c *Column) String() string { return c.path.String() + ": " + sprint(c.Name(), c) }
   188  
   189  func (c *Column) forEachLeaf(do func(*Column)) {
   190  	if len(c.columns) == 0 {
   191  		do(c)
   192  	} else {
   193  		for _, child := range c.columns {
   194  			child.forEachLeaf(do)
   195  		}
   196  	}
   197  }
   198  
   199  func openColumns(file *File) (*Column, error) {
   200  	cl := columnLoader{}
   201  
   202  	c, err := cl.open(file, nil)
   203  	if err != nil {
   204  		return nil, err
   205  	}
   206  
   207  	// Validate that there aren't extra entries in the row group columns,
   208  	// which would otherwise indicate that there are dangling data pages
   209  	// in the file.
   210  	for index, rowGroup := range file.metadata.RowGroups {
   211  		if cl.rowGroupColumnIndex != len(rowGroup.Columns) {
   212  			return nil, fmt.Errorf("row group at index %d contains %d columns but %d were referenced by the column schemas",
   213  				index, len(rowGroup.Columns), cl.rowGroupColumnIndex)
   214  		}
   215  	}
   216  
   217  	_, err = c.setLevels(0, 0, 0, 0)
   218  	return c, err
   219  }
   220  
   221  func (c *Column) setLevels(depth, repetition, definition, index int) (int, error) {
   222  	if depth > MaxColumnDepth {
   223  		return -1, fmt.Errorf("cannot represent parquet columns with more than %d nested levels: %s", MaxColumnDepth, c.path)
   224  	}
   225  	if index > MaxColumnIndex {
   226  		return -1, fmt.Errorf("cannot represent parquet rows with more than %d columns: %s", MaxColumnIndex, c.path)
   227  	}
   228  	if repetition > MaxRepetitionLevel {
   229  		return -1, fmt.Errorf("cannot represent parquet columns with more than %d repetition levels: %s", MaxRepetitionLevel, c.path)
   230  	}
   231  	if definition > MaxDefinitionLevel {
   232  		return -1, fmt.Errorf("cannot represent parquet columns with more than %d definition levels: %s", MaxDefinitionLevel, c.path)
   233  	}
   234  
   235  	switch schemaRepetitionTypeOf(c.schema) {
   236  	case format.Optional:
   237  		definition++
   238  	case format.Repeated:
   239  		repetition++
   240  		definition++
   241  	}
   242  
   243  	c.depth = int8(depth)
   244  	c.maxRepetitionLevel = byte(repetition)
   245  	c.maxDefinitionLevel = byte(definition)
   246  	depth++
   247  
   248  	if len(c.columns) > 0 {
   249  		c.index = -1
   250  	} else {
   251  		c.index = int16(index)
   252  		index++
   253  	}
   254  
   255  	var err error
   256  	for _, child := range c.columns {
   257  		if index, err = child.setLevels(depth, repetition, definition, index); err != nil {
   258  			return -1, err
   259  		}
   260  	}
   261  	return index, nil
   262  }
   263  
// columnLoader carries the cursors advanced while the column tree of a file
// is built from its metadata.
type columnLoader struct {
	// Index of the next schema element to load from the file metadata.
	schemaIndex         int
	// Index of the next column order to assign to a leaf column.
	columnOrderIndex    int
	// Index, within each row group, of the next leaf column.
	rowGroupColumnIndex int
}
   269  
   270  func (cl *columnLoader) open(file *File, path []string) (*Column, error) {
   271  	c := &Column{
   272  		file:   file,
   273  		schema: &file.metadata.Schema[cl.schemaIndex],
   274  	}
   275  	c.path = c.path.append(c.schema.Name)
   276  
   277  	cl.schemaIndex++
   278  	numChildren := int(c.schema.NumChildren)
   279  
   280  	if numChildren == 0 {
   281  		c.typ = schemaElementTypeOf(c.schema)
   282  
   283  		if cl.columnOrderIndex < len(file.metadata.ColumnOrders) {
   284  			c.order = &file.metadata.ColumnOrders[cl.columnOrderIndex]
   285  			cl.columnOrderIndex++
   286  		}
   287  
   288  		rowGroups := file.metadata.RowGroups
   289  		rowGroupColumnIndex := cl.rowGroupColumnIndex
   290  		cl.rowGroupColumnIndex++
   291  
   292  		c.chunks = make([]*format.ColumnChunk, 0, len(rowGroups))
   293  		c.columnIndex = make([]*format.ColumnIndex, 0, len(rowGroups))
   294  		c.offsetIndex = make([]*format.OffsetIndex, 0, len(rowGroups))
   295  
   296  		for i, rowGroup := range rowGroups {
   297  			if rowGroupColumnIndex >= len(rowGroup.Columns) {
   298  				return nil, fmt.Errorf("row group at index %d does not have enough columns", i)
   299  			}
   300  			c.chunks = append(c.chunks, &rowGroup.Columns[rowGroupColumnIndex])
   301  		}
   302  
   303  		if len(file.columnIndexes) > 0 {
   304  			for i := range rowGroups {
   305  				if rowGroupColumnIndex >= len(file.columnIndexes) {
   306  					return nil, fmt.Errorf("row group at index %d does not have enough column index pages", i)
   307  				}
   308  				c.columnIndex = append(c.columnIndex, &file.columnIndexes[rowGroupColumnIndex])
   309  			}
   310  		}
   311  
   312  		if len(file.offsetIndexes) > 0 {
   313  			for i := range rowGroups {
   314  				if rowGroupColumnIndex >= len(file.offsetIndexes) {
   315  					return nil, fmt.Errorf("row group at index %d does not have enough offset index pages", i)
   316  				}
   317  				c.offsetIndex = append(c.offsetIndex, &file.offsetIndexes[rowGroupColumnIndex])
   318  			}
   319  		}
   320  
   321  		if len(c.chunks) > 0 {
   322  			// Pick the encoding and compression codec of the first chunk.
   323  			//
   324  			// Technically each column chunk may use a different compression
   325  			// codec, and each page of the column chunk might have a different
   326  			// encoding. Exposing these details does not provide a lot of value
   327  			// to the end user.
   328  			//
   329  			// Programs that wish to determine the encoding and compression of
   330  			// each page of the column should iterate through the pages and read
   331  			// the page headers to determine which compression and encodings are
   332  			// applied.
   333  			for _, encoding := range c.chunks[0].MetaData.Encoding {
   334  				c.encoding = LookupEncoding(encoding)
   335  				break
   336  			}
   337  			c.compression = LookupCompressionCodec(c.chunks[0].MetaData.Codec)
   338  		}
   339  
   340  		return c, nil
   341  	}
   342  
   343  	c.typ = &groupType{}
   344  	c.columns = make([]*Column, numChildren)
   345  
   346  	for i := range c.columns {
   347  		if cl.schemaIndex >= len(file.metadata.Schema) {
   348  			return nil, fmt.Errorf("column %q has more children than there are schemas in the file: %d > %d",
   349  				c.schema.Name, cl.schemaIndex+1, len(file.metadata.Schema))
   350  		}
   351  
   352  		var err error
   353  		c.columns[i], err = cl.open(file, path)
   354  		if err != nil {
   355  			return nil, fmt.Errorf("%s: %w", c.schema.Name, err)
   356  		}
   357  	}
   358  
   359  	return c, nil
   360  }
   361  
// schemaElementTypeOf returns the Type matching the schema element s.
//
// The logical type is looked up first, then the legacy converted type, then
// the physical type. A stage that does not produce a type (for example the
// decimal cases, which are not implemented yet) falls through to the next
// one, and a null type is returned when none of them matches.
func schemaElementTypeOf(s *format.SchemaElement) Type {
	if lt := s.LogicalType; lt != nil {
		// A logical type exists, the Type interface implementations in this
		// package are all based on the logical parquet types declared in the
		// format sub-package so we can return them directly via a pointer type
		// conversion.
		switch {
		case lt.UTF8 != nil:
			return (*stringType)(lt.UTF8)
		case lt.Map != nil:
			return (*mapType)(lt.Map)
		case lt.List != nil:
			return (*listType)(lt.List)
		case lt.Enum != nil:
			return (*enumType)(lt.Enum)
		case lt.Decimal != nil:
			// TODO:
			// return (*decimalType)(lt.Decimal)
		case lt.Date != nil:
			return (*dateType)(lt.Date)
		case lt.Time != nil:
			return (*timeType)(lt.Time)
		case lt.Timestamp != nil:
			return (*timestampType)(lt.Timestamp)
		case lt.Integer != nil:
			return (*intType)(lt.Integer)
		case lt.Unknown != nil:
			return (*nullType)(lt.Unknown)
		case lt.Json != nil:
			return (*jsonType)(lt.Json)
		case lt.Bson != nil:
			return (*bsonType)(lt.Bson)
		case lt.UUID != nil:
			return (*uuidType)(lt.UUID)
		}
	}

	if ct := s.ConvertedType; ct != nil {
		// This column contains no logical type but has a converted type, it
		// was likely created by an older parquet writer. Convert the legacy
		// type representation to the equivalent logical parquet type.
		switch *ct {
		case deprecated.UTF8:
			return &stringType{}
		case deprecated.Map:
			return &mapType{}
		case deprecated.MapKeyValue:
			return &groupType{}
		case deprecated.List:
			return &listType{}
		case deprecated.Enum:
			return &enumType{}
		case deprecated.Decimal:
			// TODO
		case deprecated.Date:
			return &dateType{}
		case deprecated.TimeMillis:
			return &timeType{IsAdjustedToUTC: true, Unit: Millisecond.TimeUnit()}
		case deprecated.TimeMicros:
			return &timeType{IsAdjustedToUTC: true, Unit: Microsecond.TimeUnit()}
		case deprecated.TimestampMillis:
			return &timestampType{IsAdjustedToUTC: true, Unit: Millisecond.TimeUnit()}
		case deprecated.TimestampMicros:
			return &timestampType{IsAdjustedToUTC: true, Unit: Microsecond.TimeUnit()}
		case deprecated.Uint8:
			return &unsignedIntTypes[0]
		case deprecated.Uint16:
			return &unsignedIntTypes[1]
		case deprecated.Uint32:
			return &unsignedIntTypes[2]
		case deprecated.Uint64:
			return &unsignedIntTypes[3]
		case deprecated.Int8:
			return &signedIntTypes[0]
		case deprecated.Int16:
			return &signedIntTypes[1]
		case deprecated.Int32:
			return &signedIntTypes[2]
		case deprecated.Int64:
			return &signedIntTypes[3]
		case deprecated.Json:
			return &jsonType{}
		case deprecated.Bson:
			return &bsonType{}
		case deprecated.Interval:
			// TODO
		}
	}

	if t := s.Type; t != nil {
		// The column only has a physical type, convert it to one of the
		// primitive types supported by this package.
		switch kind := Kind(*t); kind {
		case Boolean:
			return BooleanType
		case Int32:
			return Int32Type
		case Int64:
			return Int64Type
		case Int96:
			return Int96Type
		case Float:
			return FloatType
		case Double:
			return DoubleType
		case ByteArray:
			return ByteArrayType
		case FixedLenByteArray:
			// Without a type length the fixed-length type cannot be built,
			// the null type below is returned instead.
			if s.TypeLength != nil {
				return FixedLenByteArrayType(int(*s.TypeLength))
			}
		}
	}

	// If we reach this point, we are likely reading a parquet column that was
	// written with a non-standard type or is in a newer version of the format
	// than this package supports.
	return &nullType{}
}
   481  
   482  func schemaRepetitionTypeOf(s *format.SchemaElement) format.FieldRepetitionType {
   483  	if s.RepetitionType != nil {
   484  		return *s.RepetitionType
   485  	}
   486  	return format.Required
   487  }
   488  
// dictPage holds the buffer receiving the decoded values of a dictionary
// page.
type dictPage struct {
	values []byte
}
   492  
   493  func (p *dictPage) reset() {
   494  	p.values = p.values[:0]
   495  }
   496  
// dataPage groups the buffers used while decoding a page: the decoded
// repetition and definition levels, the page data, the decoded values, and
// the dictionary used to decode dictionary-encoded pages (may be nil).
type dataPage struct {
	repetitionLevels []byte
	definitionLevels []byte
	data             []byte
	values           []byte
	dictionary       Dictionary
}
   504  
   505  func (p *dataPage) reset() {
   506  	p.repetitionLevels = p.repetitionLevels[:0]
   507  	p.definitionLevels = p.definitionLevels[:0]
   508  	p.data = p.data[:0]
   509  	p.values = p.values[:0]
   510  	p.dictionary = nil
   511  }
   512  
   513  func (p *dataPage) decompress(codec compress.Codec, data []byte) (err error) {
   514  	p.values, err = codec.Decode(p.values, data)
   515  	p.data, p.values = p.values, p.data[:0]
   516  	return err
   517  }
   518  
   519  // DecodeDataPageV1 decodes a data page from the header, compressed data, and
   520  // optional dictionary passed as arguments.
   521  func (c *Column) DecodeDataPageV1(header DataPageHeaderV1, data []byte, dict Dictionary) (Page, error) {
   522  	return c.decodeDataPageV1(header, &dataPage{data: data, dictionary: dict})
   523  }
   524  
   525  func (c *Column) decodeDataPageV1(header DataPageHeaderV1, page *dataPage) (Page, error) {
   526  	var err error
   527  
   528  	if isCompressed(c.compression) {
   529  		if err := page.decompress(c.compression, page.data); err != nil {
   530  			return nil, fmt.Errorf("decompressing data page v1: %w", err)
   531  		}
   532  	}
   533  
   534  	numValues := header.NumValues()
   535  	data := page.data
   536  	page.repetitionLevels = page.repetitionLevels[:0]
   537  	page.definitionLevels = page.definitionLevels[:0]
   538  
   539  	if c.maxRepetitionLevel > 0 {
   540  		encoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel)
   541  		page.repetitionLevels, data, err = decodeLevelsV1(encoding, numValues, page.repetitionLevels, data)
   542  		if err != nil {
   543  			return nil, fmt.Errorf("decoding repetition levels of data page v1: %w", err)
   544  		}
   545  	}
   546  
   547  	if c.maxDefinitionLevel > 0 {
   548  		encoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel)
   549  		page.definitionLevels, data, err = decodeLevelsV1(encoding, numValues, page.definitionLevels, data)
   550  		if err != nil {
   551  			return nil, fmt.Errorf("decoding definition levels of data page v1: %w", err)
   552  		}
   553  
   554  		// Data pages v1 did not embed the number of null values,
   555  		// so we have to compute it from the definition levels.
   556  		numValues -= int64(countLevelsNotEqual(page.definitionLevels, c.maxDefinitionLevel))
   557  	}
   558  
   559  	return c.decodeDataPage(header, numValues, page, data)
   560  }
   561  
   562  // DecodeDataPageV2 decodes a data page from the header, compressed data, and
   563  // optional dictionary passed as arguments.
   564  func (c *Column) DecodeDataPageV2(header DataPageHeaderV2, data []byte, dict Dictionary) (Page, error) {
   565  	return c.decodeDataPageV2(header, &dataPage{data: data, dictionary: dict})
   566  }
   567  
   568  func (c *Column) decodeDataPageV2(header DataPageHeaderV2, page *dataPage) (Page, error) {
   569  	var numValues = header.NumValues()
   570  	var err error
   571  	var data = page.data
   572  	page.repetitionLevels = page.repetitionLevels[:0]
   573  	page.definitionLevels = page.definitionLevels[:0]
   574  
   575  	if c.maxRepetitionLevel > 0 {
   576  		encoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel)
   577  		length := header.RepetitionLevelsByteLength()
   578  		page.repetitionLevels, data, err = decodeLevelsV2(encoding, numValues, page.repetitionLevels, data, length)
   579  		if err != nil {
   580  			return nil, fmt.Errorf("decoding repetition levels of data page v2: %w", io.ErrUnexpectedEOF)
   581  		}
   582  	}
   583  
   584  	if c.maxDefinitionLevel > 0 {
   585  		encoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel)
   586  		length := header.DefinitionLevelsByteLength()
   587  		page.definitionLevels, data, err = decodeLevelsV2(encoding, numValues, page.definitionLevels, data, length)
   588  		if err != nil {
   589  			return nil, fmt.Errorf("decoding definition levels of data page v2: %w", io.ErrUnexpectedEOF)
   590  		}
   591  	}
   592  
   593  	if isCompressed(c.compression) && header.IsCompressed() {
   594  		if err := page.decompress(c.compression, data); err != nil {
   595  			return nil, fmt.Errorf("decompressing data page v2: %w", err)
   596  		}
   597  		data = page.data
   598  	}
   599  
   600  	numValues -= header.NumNulls()
   601  	return c.decodeDataPage(header, numValues, page, data)
   602  }
   603  
   604  func (c *Column) decodeDataPage(header DataPageHeader, numValues int64, page *dataPage, data []byte) (Page, error) {
   605  	encoding := LookupEncoding(header.Encoding())
   606  	pageType := c.Type()
   607  
   608  	if isDictionaryEncoding(encoding) {
   609  		// In some legacy configurations, the PLAIN_DICTIONARY encoding is used
   610  		// on data page headers to indicate that the page contains indexes into
   611  		// the dictionary page, but the page is still encoded using the RLE
   612  		// encoding in this case, so we convert it to RLE_DICTIONARY.
   613  		encoding = &RLEDictionary
   614  		pageType = indexedPageType{newIndexedType(pageType, page.dictionary)}
   615  	}
   616  
   617  	var err error
   618  	page.values, err = pageType.Decode(page.values, data, encoding)
   619  	if err != nil {
   620  		return nil, err
   621  	}
   622  
   623  	newPage := pageType.NewPage(c.Index(), int(numValues), page.values)
   624  	switch {
   625  	case c.maxRepetitionLevel > 0:
   626  		newPage = newRepeatedPage(newPage.Buffer(), c.maxRepetitionLevel, c.maxDefinitionLevel, page.repetitionLevels, page.definitionLevels)
   627  	case c.maxDefinitionLevel > 0:
   628  		newPage = newOptionalPage(newPage.Buffer(), c.maxDefinitionLevel, page.definitionLevels)
   629  	}
   630  	return newPage, nil
   631  }
   632  
   633  func decodeLevelsV1(enc encoding.Encoding, numValues int64, levels, data []byte) ([]byte, []byte, error) {
   634  	if len(data) < 4 {
   635  		return nil, data, io.ErrUnexpectedEOF
   636  	}
   637  	i := 4
   638  	j := 4 + int(binary.LittleEndian.Uint32(data))
   639  	if j > len(data) {
   640  		return nil, data, io.ErrUnexpectedEOF
   641  	}
   642  	levels, err := decodeLevels(enc, numValues, levels, data[i:j])
   643  	return levels, data[j:], err
   644  }
   645  
   646  func decodeLevelsV2(enc encoding.Encoding, numValues int64, levels, data []byte, length int64) ([]byte, []byte, error) {
   647  	if length > int64(len(data)) {
   648  		return nil, data, io.ErrUnexpectedEOF
   649  	}
   650  	levels, err := decodeLevels(enc, numValues, levels, data[:length])
   651  	return levels, data[length:], err
   652  }
   653  
   654  func decodeLevels(enc encoding.Encoding, numValues int64, levels, data []byte) ([]byte, error) {
   655  	if cap(levels) < int(numValues) {
   656  		levels = make([]byte, numValues)
   657  	}
   658  	levels, err := enc.DecodeLevels(levels, data)
   659  	if err == nil {
   660  		switch {
   661  		case len(levels) < int(numValues):
   662  			err = fmt.Errorf("decoding level expected %d values but got only %d", numValues, len(levels))
   663  		case len(levels) > int(numValues):
   664  			levels = levels[:numValues]
   665  		}
   666  	}
   667  	return levels, err
   668  }
   669  
   670  // DecodeDictionary decodes a data page from the header and compressed data
   671  // passed as arguments.
   672  func (c *Column) DecodeDictionary(header DictionaryPageHeader, data []byte) (Dictionary, error) {
   673  	return c.decodeDictionary(header, &dataPage{data: data}, &dictPage{})
   674  }
   675  
   676  func (c *Column) decodeDictionary(header DictionaryPageHeader, page *dataPage, dict *dictPage) (Dictionary, error) {
   677  	if isCompressed(c.compression) {
   678  		if err := page.decompress(c.compression, page.data); err != nil {
   679  			return nil, fmt.Errorf("decompressing dictionary page: %w", err)
   680  		}
   681  	}
   682  
   683  	pageType := c.Type()
   684  	encoding := header.Encoding()
   685  	if encoding == format.PlainDictionary {
   686  		encoding = format.Plain
   687  	}
   688  
   689  	var err error
   690  	page.values, err = pageType.Decode(page.values, page.data, LookupEncoding(encoding))
   691  	if err != nil {
   692  		return nil, err
   693  	}
   694  
   695  	dict.values = append(dict.values[:0], page.values...)
   696  	return pageType.NewDictionary(int(c.index), int(header.NumValues()), dict.values), nil
   697  }
   698  
// Compile-time check that *Column satisfies the Node interface.
var (
	_ Node = (*Column)(nil)
)