github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/format/parquet.go

github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/format/parquet.go (about)

     1  package format
     2  
     3  import (
     4  	"fmt"
     5  
     6  	"github.com/vc42/parquet-go/deprecated"
     7  )
     8  
     9  // Types supported by Parquet. These types are intended to be used in combination
    10  // with the encodings to control the on disk storage format. For example INT16
    11  // is not included as a type since a good encoding of INT32 would handle this.
    12  type Type int32
    13  
    14  const (
    15  	Boolean           Type = 0
    16  	Int32             Type = 1
    17  	Int64             Type = 2
    18  	Int96             Type = 3 // deprecated, only used by legacy implementations.
    19  	Float             Type = 4
    20  	Double            Type = 5
    21  	ByteArray         Type = 6
    22  	FixedLenByteArray Type = 7
    23  )
    24  
    25  func (t Type) String() string {
    26  	switch t {
    27  	case Boolean:
    28  		return "BOOLEAN"
    29  	case Int32:
    30  		return "INT32"
    31  	case Int64:
    32  		return "INT64"
    33  	case Int96:
    34  		return "INT96"
    35  	case Float:
    36  		return "FLOAT"
    37  	case Double:
    38  		return "DOUBLE"
    39  	case ByteArray:
    40  		return "BYTE_ARRAY"
    41  	case FixedLenByteArray:
    42  		return "FIXED_LEN_BYTE_ARRAY"
    43  	default:
    44  		return "Type(?)"
    45  	}
    46  }
    47  
    48  // Representation of Schemas.
    49  type FieldRepetitionType int32
    50  
    51  const (
    52  	// The field is required (can not be null) and each record has exactly 1 value.
    53  	Required FieldRepetitionType = 0
    54  	// The field is optional (can be null) and each record has 0 or 1 values.
    55  	Optional FieldRepetitionType = 1
    56  	// The field is repeated and can contain 0 or more values.
    57  	Repeated FieldRepetitionType = 2
    58  )
    59  
    60  func (t FieldRepetitionType) String() string {
    61  	switch t {
    62  	case Required:
    63  		return "REQUIRED"
    64  	case Optional:
    65  		return "OPTIONAL"
    66  	case Repeated:
    67  		return "REPEATED"
    68  	default:
    69  		return "FieldRepeationaType(?)"
    70  	}
    71  }
    72  
// Statistics holds statistics per row group and per page.
// All fields are optional.
type Statistics struct {
	// DEPRECATED: min and max value of the column. Use min_value and max_value
	// (the MinValue and MaxValue fields below) instead.
	//
	// Values are encoded using PLAIN encoding, except that variable-length byte
	// arrays do not include a length prefix.
	//
	// These fields encode min and max values determined by signed comparison
	// only. New files should use the correct order for a column's logical type
	// and store the values in the min_value and max_value fields.
	//
	// To support older readers, these may be set when the column order is
	// signed.
	Max []byte `thrift:"1"`
	Min []byte `thrift:"2"`
	// Count of null values in the column.
	NullCount int64 `thrift:"3"`
	// Count of distinct values occurring.
	DistinctCount int64 `thrift:"4"`
	// Min and max values for the column, determined by its ColumnOrder.
	//
	// Values are encoded using PLAIN encoding, except that variable-length byte
	// arrays do not include a length prefix.
	MaxValue []byte `thrift:"5"`
	MinValue []byte `thrift:"6"`
}
   100  
   101  // Empty structs to use as logical type annotations.
   102  type StringType struct{} // allowed for BINARY, must be encoded with UTF-8
   103  type UUIDType struct{}   // allowed for FIXED[16], must encode raw UUID bytes
   104  type MapType struct{}    // see see LogicalTypes.md
   105  type ListType struct{}   // see LogicalTypes.md
   106  type EnumType struct{}   // allowed for BINARY, must be encoded with UTF-8
   107  type DateType struct{}   // allowed for INT32
   108  
   109  func (*StringType) String() string { return "STRING" }
   110  func (*UUIDType) String() string   { return "UUID" }
   111  func (*MapType) String() string    { return "MAP" }
   112  func (*ListType) String() string   { return "LIST" }
   113  func (*EnumType) String() string   { return "ENUM" }
   114  func (*DateType) String() string   { return "DATE" }
   115  
   116  // Logical type to annotate a column that is always null.
   117  //
   118  // Sometimes when discovering the schema of existing data, values are always
   119  // null and the physical type can't be determined. This annotation signals
   120  // the case where the physical type was guessed from all null values.
   121  type NullType struct{}
   122  
   123  func (*NullType) String() string { return "NULL" }
   124  
   125  // Decimal logical type annotation
   126  //
   127  // To maintain forward-compatibility in v1, implementations using this logical
   128  // type must also set scale and precision on the annotated SchemaElement.
   129  //
   130  // Allowed for physical types: INT32, INT64, FIXED, and BINARY
   131  type DecimalType struct {
   132  	Scale     int32 `thrift:"1,required"`
   133  	Precision int32 `thrift:"2,required"`
   134  }
   135  
   136  func (t *DecimalType) String() string {
   137  	return fmt.Sprintf("DECIMAL(%d,%d)", t.Scale, t.Precision)
   138  }
   139  
   140  // Time units for logical types.
   141  type MilliSeconds struct{}
   142  type MicroSeconds struct{}
   143  type NanoSeconds struct{}
   144  
   145  func (*MilliSeconds) String() string { return "MILLIS" }
   146  func (*MicroSeconds) String() string { return "MICROS" }
   147  func (*NanoSeconds) String() string  { return "NANOS" }
   148  
   149  type TimeUnit struct { // union
   150  	Millis *MilliSeconds `thrift:"1"`
   151  	Micros *MicroSeconds `thrift:"2"`
   152  	Nanos  *NanoSeconds  `thrift:"3"`
   153  }
   154  
   155  func (u *TimeUnit) String() string {
   156  	switch {
   157  	case u.Millis != nil:
   158  		return u.Millis.String()
   159  	case u.Micros != nil:
   160  		return u.Micros.String()
   161  	case u.Nanos != nil:
   162  		return u.Nanos.String()
   163  	default:
   164  		return ""
   165  	}
   166  }
   167  
   168  // Timestamp logical type annotation
   169  //
   170  // Allowed for physical types: INT64
   171  type TimestampType struct {
   172  	IsAdjustedToUTC bool     `thrift:"1,required"`
   173  	Unit            TimeUnit `thrift:"2,required"`
   174  }
   175  
   176  func (t *TimestampType) String() string {
   177  	return fmt.Sprintf("TIMESTAMP(isAdjustedToUTC=%t,unit=%s)", t.IsAdjustedToUTC, &t.Unit)
   178  }
   179  
   180  // Time logical type annotation
   181  //
   182  // Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
   183  type TimeType struct {
   184  	IsAdjustedToUTC bool     `thrift:"1,required"`
   185  	Unit            TimeUnit `thrift:"2,required"`
   186  }
   187  
   188  func (t *TimeType) String() string {
   189  	return fmt.Sprintf("TIME(isAdjustedToUTC=%t,unit=%s)", t.IsAdjustedToUTC, &t.Unit)
   190  }
   191  
   192  // Integer logical type annotation
   193  //
   194  // bitWidth must be 8, 16, 32, or 64.
   195  //
   196  // Allowed for physical types: INT32, INT64
   197  type IntType struct {
   198  	BitWidth int8 `thrift:"1,required"`
   199  	IsSigned bool `thrift:"2,required"`
   200  }
   201  
   202  func (t *IntType) String() string {
   203  	return fmt.Sprintf("INT(%d,%t)", t.BitWidth, t.IsSigned)
   204  }
   205  
   206  // Embedded JSON logical type annotation
   207  //
   208  // Allowed for physical types: BINARY
   209  type JsonType struct{}
   210  
   211  func (t *JsonType) String() string { return "JSON" }
   212  
   213  // Embedded BSON logical type annotation
   214  //
   215  // Allowed for physical types: BINARY
   216  type BsonType struct{}
   217  
   218  func (t *BsonType) String() string { return "BSON" }
   219  
   220  // LogicalType annotations to replace ConvertedType.
   221  //
   222  // To maintain compatibility, implementations using LogicalType for a
   223  // SchemaElement must also set the corresponding ConvertedType (if any)
   224  // from the following table.
   225  type LogicalType struct { // union
   226  	UTF8    *StringType  `thrift:"1"` // use ConvertedType UTF8
   227  	Map     *MapType     `thrift:"2"` // use ConvertedType Map
   228  	List    *ListType    `thrift:"3"` // use ConvertedType List
   229  	Enum    *EnumType    `thrift:"4"` // use ConvertedType Enum
   230  	Decimal *DecimalType `thrift:"5"` // use ConvertedType Decimal + SchemaElement.{Scale, Precision}
   231  	Date    *DateType    `thrift:"6"` // use ConvertedType Date
   232  
   233  	// use ConvertedType TimeMicros for Time{IsAdjustedToUTC: *, Unit: Micros}
   234  	// use ConvertedType TimeMillis for Time{IsAdjustedToUTC: *, Unit: Millis}
   235  	Time *TimeType `thrift:"7"`
   236  
   237  	// use ConvertedType TimestampMicros for Timestamp{IsAdjustedToUTC: *, Unit: Micros}
   238  	// use ConvertedType TimestampMillis for Timestamp{IsAdjustedToUTC: *, Unit: Millis}
   239  	Timestamp *TimestampType `thrift:"8"`
   240  
   241  	// 9: reserved for Interval
   242  	Integer *IntType  `thrift:"10"` // use ConvertedType Int* or Uint*
   243  	Unknown *NullType `thrift:"11"` // no compatible ConvertedType
   244  	Json    *JsonType `thrift:"12"` // use ConvertedType JSON
   245  	Bson    *BsonType `thrift:"13"` // use ConvertedType BSON
   246  	UUID    *UUIDType `thrift:"14"` // no compatible ConvertedType
   247  }
   248  
   249  func (t *LogicalType) String() string {
   250  	switch {
   251  	case t.UTF8 != nil:
   252  		return t.UTF8.String()
   253  	case t.Map != nil:
   254  		return t.Map.String()
   255  	case t.List != nil:
   256  		return t.List.String()
   257  	case t.Enum != nil:
   258  		return t.Enum.String()
   259  	case t.Decimal != nil:
   260  		return t.Decimal.String()
   261  	case t.Date != nil:
   262  		return t.Date.String()
   263  	case t.Time != nil:
   264  		return t.Time.String()
   265  	case t.Timestamp != nil:
   266  		return t.Timestamp.String()
   267  	case t.Integer != nil:
   268  		return t.Integer.String()
   269  	case t.Unknown != nil:
   270  		return t.Unknown.String()
   271  	case t.Json != nil:
   272  		return t.Json.String()
   273  	case t.Bson != nil:
   274  		return t.Bson.String()
   275  	case t.UUID != nil:
   276  		return t.UUID.String()
   277  	default:
   278  		return ""
   279  	}
   280  }
   281  
// SchemaElement represents an element inside a schema definition.
//
//   - if it is a group (inner node) then type is undefined and num_children is
//     defined
//
//   - if it is a primitive type (leaf) then type is defined and num_children is
//     undefined
//
// The nodes are listed in depth first traversal order.
type SchemaElement struct {
	// Data type for this field. Not set if the current element is a non-leaf node.
	Type *Type `thrift:"1,optional"`

	// If type is FixedLenByteArray, this is the byte length of the values.
	// Otherwise, if specified, this is the maximum bit length to store any of the values.
	// (e.g. a low cardinality INT col could have this set to 3).  Note that this is
	// in the schema, and therefore fixed for the entire file.
	TypeLength *int32 `thrift:"2,optional"`

	// Repetition of the field. The root of the schema does not have a repetition_type.
	// All other nodes must have one.
	RepetitionType *FieldRepetitionType `thrift:"3,optional"`

	// Name of the field in the schema.
	Name string `thrift:"4,required"`

	// Nested fields.  Since thrift does not support nested fields,
	// the nesting is flattened to a single list by a depth-first traversal.
	// The children count is used to construct the nested relationship.
	// This field is not set when the element is a primitive type.
	NumChildren int32 `thrift:"5,optional"`

	// DEPRECATED: When the schema is the result of a conversion from another model.
	// Used to record the original type to help with cross conversion.
	//
	// This is superseded by logicalType.
	ConvertedType *deprecated.ConvertedType `thrift:"6,optional"`

	// DEPRECATED: Used when this column contains decimal data.
	// See the DECIMAL converted type for more details.
	//
	// This is superseded by using the DecimalType annotation in logicalType.
	Scale     *int32 `thrift:"7,optional"`
	Precision *int32 `thrift:"8,optional"`

	// When the original schema supports field ids, this will save the
	// original field id in the parquet schema.
	FieldID int32 `thrift:"9,optional"`

	// The logical type of this SchemaElement.
	//
	// LogicalType replaces ConvertedType, but ConvertedType is still required
	// for some logical types to ensure forward-compatibility in format v1.
	LogicalType *LogicalType `thrift:"10,optional"`
}
   337  
   338  // Encodings supported by Parquet. Not all encodings are valid for all types.
   339  // These enums are also used to specify the encoding of definition and
   340  // repetition levels. See the accompanying doc for the details of the more
   341  // complicated encodings.
   342  type Encoding int32
   343  
   344  const (
   345  	// Default encoding.
   346  	// Boolean - 1 bit per value. 0 is false; 1 is true.
   347  	// Int32 - 4 bytes per value. Stored as little-endian.
   348  	// Int64 - 8 bytes per value. Stored as little-endian.
   349  	// Float - 4 bytes per value. IEEE. Stored as little-endian.
   350  	// Double - 8 bytes per value. IEEE. Stored as little-endian.
   351  	// ByteArray - 4 byte length stored as little endian, followed by bytes.
   352  	// FixedLenByteArray - Just the bytes.
   353  	Plain Encoding = 0
   354  
   355  	// Group VarInt encoding for Int32/Int64.
   356  	// This encoding is deprecated. It was never used.
   357  	// GroupVarInt Encoding = 1
   358  
   359  	// Deprecated: Dictionary encoding. The values in the dictionary are encoded
   360  	// in the plain type.
   361  	// In a data page use RLEDictionary instead.
   362  	// In a Dictionary page use Plain instead.
   363  	PlainDictionary Encoding = 2
   364  
   365  	// Group packed run length encoding. Usable for definition/repetition levels
   366  	// encoding and Booleans (on one bit: 0 is false 1 is true.)
   367  	RLE Encoding = 3
   368  
   369  	// Bit packed encoding. This can only be used if the data has a known max
   370  	// width. Usable for definition/repetition levels encoding.
   371  	BitPacked Encoding = 4
   372  
   373  	// Delta encoding for integers. This can be used for int columns and works best
   374  	// on sorted data.
   375  	DeltaBinaryPacked Encoding = 5
   376  
   377  	// Encoding for byte arrays to separate the length values and the data.
   378  	// The lengths are encoded using DeltaBinaryPacked.
   379  	DeltaLengthByteArray Encoding = 6
   380  
   381  	// Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED.
   382  	// Suffixes are stored as delta length byte arrays.
   383  	DeltaByteArray Encoding = 7
   384  
   385  	// Dictionary encoding: the ids are encoded using the RLE encoding
   386  	RLEDictionary Encoding = 8
   387  
   388  	// Encoding for floating-point data.
   389  	// K byte-streams are created where K is the size in bytes of the data type.
   390  	// The individual bytes of an FP value are scattered to the corresponding stream and
   391  	// the streams are concatenated.
   392  	// This itself does not reduce the size of the data but can lead to better compression
   393  	// afterwards.
   394  	ByteStreamSplit Encoding = 9
   395  )
   396  
   397  func (e Encoding) String() string {
   398  	switch e {
   399  	case Plain:
   400  		return "PLAIN"
   401  	case PlainDictionary:
   402  		return "PLAIN_DICTIONARY"
   403  	case RLE:
   404  		return "RLE"
   405  	case BitPacked:
   406  		return "BIT_PACKED"
   407  	case DeltaBinaryPacked:
   408  		return "DELTA_BINARY_PACKED"
   409  	case DeltaLengthByteArray:
   410  		return "DELTA_LENGTH_BYTE_ARRAY"
   411  	case DeltaByteArray:
   412  		return "DELTA_BYTE_ARRAY"
   413  	case RLEDictionary:
   414  		return "RLE_DICTIONARY"
   415  	case ByteStreamSplit:
   416  		return "BYTE_STREAM_SPLIT"
   417  	default:
   418  		return "Encoding(?)"
   419  	}
   420  }
   421  
   422  // Supported compression algorithms.
   423  //
   424  // Codecs added in format version X.Y can be read by readers based on X.Y and later.
   425  // Codec support may vary between readers based on the format version and
   426  // libraries available at runtime.
   427  //
   428  // See Compression.md for a detailed specification of these algorithms.
   429  type CompressionCodec int32
   430  
   431  const (
   432  	Uncompressed CompressionCodec = 0
   433  	Snappy       CompressionCodec = 1
   434  	Gzip         CompressionCodec = 2
   435  	LZO          CompressionCodec = 3
   436  	Brotli       CompressionCodec = 4 // Added in 2.4
   437  	Lz4          CompressionCodec = 5 // DEPRECATED (Added in 2.4)
   438  	Zstd         CompressionCodec = 6 // Added in 2.4
   439  	Lz4Raw       CompressionCodec = 7 // Added in 2.9
   440  )
   441  
   442  func (c CompressionCodec) String() string {
   443  	switch c {
   444  	case Uncompressed:
   445  		return "UNCOMPRESSED"
   446  	case Snappy:
   447  		return "SNAPPY"
   448  	case Gzip:
   449  		return "GZIP"
   450  	case LZO:
   451  		return "LZO"
   452  	case Brotli:
   453  		return "BROTLI"
   454  	case Lz4:
   455  		return "LZ4"
   456  	case Zstd:
   457  		return "ZSTD"
   458  	case Lz4Raw:
   459  		return "LZ4_RAW"
   460  	default:
   461  		return "CompressionCodec(?)"
   462  	}
   463  }
   464  
   465  type PageType int32
   466  
   467  const (
   468  	DataPage       PageType = 0
   469  	IndexPage      PageType = 1
   470  	DictionaryPage PageType = 2
   471  	// Version 2 is indicated in the PageHeader and the use of DataPageHeaderV2,
   472  	// and allows you to read repetition and definition level data without
   473  	// decompressing the Page.
   474  	DataPageV2 PageType = 3
   475  )
   476  
   477  func (p PageType) String() string {
   478  	switch p {
   479  	case DataPage:
   480  		return "DATA_PAGE"
   481  	case IndexPage:
   482  		return "INDEX_PAGE"
   483  	case DictionaryPage:
   484  		return "DICTIONARY_PAGE"
   485  	case DataPageV2:
   486  		return "DATA_PAGE_V2"
   487  	default:
   488  		return "PageType(?)"
   489  	}
   490  }
   491  
   492  // Enum to annotate whether lists of min/max elements inside ColumnIndex
   493  // are ordered and if so, in which direction.
   494  type BoundaryOrder int32
   495  
   496  const (
   497  	Unordered  BoundaryOrder = 0
   498  	Ascending  BoundaryOrder = 1
   499  	Descending BoundaryOrder = 2
   500  )
   501  
   502  func (b BoundaryOrder) String() string {
   503  	switch b {
   504  	case Unordered:
   505  		return "UNORDERED"
   506  	case Ascending:
   507  		return "ASCENDING"
   508  	case Descending:
   509  		return "DESCENDING"
   510  	default:
   511  		return "BoundaryOrder(?)"
   512  	}
   513  }
   514  
// DataPageHeader is the data page header.
type DataPageHeader struct {
	// Number of values, including NULLs, in this data page.
	NumValues int32 `thrift:"1,required"`

	// Encoding used for this data page.
	Encoding Encoding `thrift:"2,required"`

	// Encoding used for definition levels.
	DefinitionLevelEncoding Encoding `thrift:"3,required"`

	// Encoding used for repetition levels.
	RepetitionLevelEncoding Encoding `thrift:"4,required"`

	// Optional statistics for the data in this page.
	Statistics Statistics `thrift:"5,optional"`
}
   532  
// IndexPageHeader is the header for index pages. It currently declares no
// fields.
type IndexPageHeader struct {
	// TODO
}
   536  
// DictionaryPageHeader is the header of a dictionary page.
//
// The dictionary page must be placed at the first position of the column chunk
// if it is partly or completely dictionary encoded. At most one dictionary page
// can be placed in a column chunk.
type DictionaryPageHeader struct {
	// Number of values in the dictionary.
	NumValues int32 `thrift:"1,required"`

	// Encoding using this dictionary page.
	Encoding Encoding `thrift:"2,required"`

	// If true, the entries in the dictionary are sorted in ascending order.
	IsSorted bool `thrift:"3,optional"`
}
   550  
// DataPageHeaderV2 is the header of the new page format, which allows reading
// levels without decompressing the data. Repetition and definition levels are
// uncompressed. The remaining section containing the data is compressed if
// is_compressed is true.
type DataPageHeaderV2 struct {
	// Number of values, including NULLs, in this data page.
	NumValues int32 `thrift:"1,required"`
	// Number of NULL values, in this data page.
	// Number of non-null = num_values - num_nulls which is also the number of
	// values in the data section.
	NumNulls int32 `thrift:"2,required"`
	// Number of rows in this data page, which means pages change on record
	// boundaries (r = 0).
	NumRows int32 `thrift:"3,required"`
	// Encoding used for data in this page.
	Encoding Encoding `thrift:"4,required"`

	// Repetition levels and definition levels are always using RLE (without size in it).

	// Length of the definition levels.
	DefinitionLevelsByteLength int32 `thrift:"5,required"`
	// Length of the repetition levels.
	RepetitionLevelsByteLength int32 `thrift:"6,required"`

	// Whether the values are compressed.
	// Which means the section of the page between
	// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
	// is compressed with the compression_codec.
	// If missing it is considered compressed.
	IsCompressed *bool `thrift:"7,optional"`

	// Optional statistics for the data in this page.
	Statistics Statistics `thrift:"8,optional"`
}
   584  
// SplitBlockAlgorithm is the block-based algorithm type annotation.
type SplitBlockAlgorithm struct{}
   587  
// BloomFilterAlgorithm is the algorithm used in Bloom filter.
type BloomFilterAlgorithm struct { // union
	Block *SplitBlockAlgorithm `thrift:"1"`
}
   592  
// XxHash is the hash strategy type annotation. xxHash is an extremely fast
// non-cryptographic hash algorithm. It uses the 64-bit version of xxHash.
type XxHash struct{}
   596  
// BloomFilterHash is the hash function used in Bloom filter. This function
// takes the hash of a column value using plain encoding.
type BloomFilterHash struct { // union
	XxHash *XxHash `thrift:"1"`
}
   602  
// BloomFilterUncompressed is an empty annotation type; it is the only member
// of the BloomFilterCompression union.
type BloomFilterUncompressed struct{}
// BloomFilterCompression is the compression used in the Bloom filter.
type BloomFilterCompression struct { // union
	Uncompressed *BloomFilterUncompressed `thrift:"1"`
}
   608  
// BloomFilterHeader is the Bloom filter header. It is stored at the beginning
// of the Bloom filter data of each column and is followed by its bitset.
type BloomFilterHeader struct {
	// The size of bitset in bytes.
	NumBytes int32 `thrift:"1,required"`
	// The algorithm for setting bits.
	Algorithm BloomFilterAlgorithm `thrift:"2,required"`
	// The hash function used for Bloom filter.
	Hash BloomFilterHash `thrift:"3,required"`
	// The compression used in the Bloom filter.
	Compression BloomFilterCompression `thrift:"4,required"`
}
   621  
// PageHeader is the header of a page. Its Type field indicates which of the
// page-specific *Header fields is set.
type PageHeader struct {
	// The type of the page indicates which of the *Header fields below is set.
	Type PageType `thrift:"1,required"`

	// Uncompressed page size in bytes (not including this header).
	UncompressedPageSize int32 `thrift:"2,required"`

	// Compressed (and potentially encrypted) page size in bytes, not including
	// this header.
	CompressedPageSize int32 `thrift:"3,required"`

	// The 32bit CRC for the page, to be calculated as follows:
	// - Using the standard CRC32 algorithm
	// - On the data only, i.e. this header should not be included. 'Data'
	//   hereby refers to the concatenation of the repetition levels, the
	//   definition levels and the column value, in this exact order.
	// - On the encoded versions of the repetition levels, definition levels and
	//   column values.
	// - On the compressed versions of the repetition levels, definition levels
	//   and column values where possible;
	//   - For v1 data pages, the repetition levels, definition levels and column
	//     values are always compressed together. If a compression scheme is
	//     specified, the CRC shall be calculated on the compressed version of
	//     this concatenation. If no compression scheme is specified, the CRC
	//     shall be calculated on the uncompressed version of this concatenation.
	//   - For v2 data pages, the repetition levels and definition levels are
	//     handled separately from the data and are never compressed (only
	//     encoded). If a compression scheme is specified, the CRC shall be
	//     calculated on the concatenation of the uncompressed repetition levels,
	//     uncompressed definition levels and the compressed column values.
	//     If no compression scheme is specified, the CRC shall be calculated on
	//     the uncompressed concatenation.
	// - In encrypted columns, CRC is calculated after page encryption; the
	//   encryption itself is performed after page compression (if compressed)
	// If enabled, this allows for disabling checksumming in HDFS if only a few
	// pages need to be read.
	CRC int32 `thrift:"4,optional"`

	// Headers for page specific data. Only one will be set.
	DataPageHeader       *DataPageHeader       `thrift:"5,optional"`
	IndexPageHeader      *IndexPageHeader      `thrift:"6,optional"`
	DictionaryPageHeader *DictionaryPageHeader `thrift:"7,optional"`
	DataPageHeaderV2     *DataPageHeaderV2     `thrift:"8,optional"`
}
   666  
// KeyValue is a wrapper struct to store key values.
type KeyValue struct {
	Key   string `thrift:"1,required"`
	Value string `thrift:"2,required"`
}
   672  
// SortingColumn is a wrapper struct to specify sort order.
type SortingColumn struct {
	// The column index (in this row group).
	ColumnIdx int32 `thrift:"1,required"`

	// If true, indicates this column is sorted in descending order.
	Descending bool `thrift:"2,required"`

	// If true, nulls will come before non-null values, otherwise,
	// nulls go at the end.
	NullsFirst bool `thrift:"3,required"`
}
   685  
// PageEncodingStats holds statistics of a given page type and encoding.
type PageEncodingStats struct {
	// The page type (data/dic/...).
	PageType PageType `thrift:"1,required"`

	// Encoding of the page.
	Encoding Encoding `thrift:"2,required"`

	// Number of pages of this type with this encoding.
	Count int32 `thrift:"3,required"`
}
   697  
// ColumnMetaData is the description for column metadata.
type ColumnMetaData struct {
	// Type of this column.
	Type Type `thrift:"1,required"`

	// Set of all encodings used for this column. The purpose is to validate
	// whether we can decode those pages.
	Encoding []Encoding `thrift:"2,required"`

	// Path in schema.
	PathInSchema []string `thrift:"3,required"`

	// Compression codec.
	Codec CompressionCodec `thrift:"4,required"`

	// Number of values in this column.
	NumValues int64 `thrift:"5,required"`

	// Total byte size of all uncompressed pages in this column chunk (including the headers).
	TotalUncompressedSize int64 `thrift:"6,required"`

	// Total byte size of all compressed, and potentially encrypted, pages
	// in this column chunk (including the headers).
	TotalCompressedSize int64 `thrift:"7,required"`

	// Optional key/value metadata.
	KeyValueMetadata []KeyValue `thrift:"8,optional"`

	// Byte offset from beginning of file to first data page.
	DataPageOffset int64 `thrift:"9,required"`

	// Byte offset from beginning of file to root index page.
	IndexPageOffset int64 `thrift:"10,optional"`

	// Byte offset from the beginning of file to first (only) dictionary page.
	DictionaryPageOffset int64 `thrift:"11,optional"`

	// Optional statistics for this column chunk.
	Statistics Statistics `thrift:"12,optional"`

	// Set of all encodings used for pages in this column chunk.
	// This information can be used to determine if all data pages are
	// dictionary encoded for example.
	EncodingStats []PageEncodingStats `thrift:"13,optional"`

	// Byte offset from beginning of file to Bloom filter data.
	BloomFilterOffset int64 `thrift:"14,optional"`
}
   746  
// EncryptionWithFooterKey is an empty struct used as a member of
// ColumnCryptoMetaData.
// NOTE(review): presumably marks a column encrypted with the footer key; confirm
// against the Parquet encryption specification.
type EncryptionWithFooterKey struct{}
   748  
// EncryptionWithColumnKey carries the schema path of a column and the
// retrieval metadata of its encryption key. It is used as a member of
// ColumnCryptoMetaData.
type EncryptionWithColumnKey struct {
	// Column path in schema.
	PathInSchema []string `thrift:"1,required"`

	// Retrieval metadata of column encryption key.
	KeyMetadata []byte `thrift:"2,optional"`
}
   756  
// ColumnCryptoMetaData is the crypto metadata of an encrypted column (see
// ColumnChunk.CryptoMetadata).
// NOTE(review): the pointer members look like a union (one of the two set),
// like the other union structs in this file; confirm against the Parquet
// Thrift definition.
type ColumnCryptoMetaData struct {
	EncryptionWithFooterKey *EncryptionWithFooterKey `thrift:"1"`
	EncryptionWithColumnKey *EncryptionWithColumnKey `thrift:"2"`
}
   761  
// ColumnChunk locates a column chunk and its metadata, along with the
// offsets and sizes of its OffsetIndex and ColumnIndex and its encryption
// metadata.
type ColumnChunk struct {
	// File where column data is stored.  If not set, assumed to be same file as
	// metadata.  This path is relative to the current file.
	FilePath string `thrift:"1,optional"`

	// Byte offset in file_path to the ColumnMetaData.
	FileOffset int64 `thrift:"2,required"`

	// Column metadata for this chunk. This is the same content as what is at
	// file_path/file_offset. Having it here has it replicated in the file
	// metadata.
	MetaData ColumnMetaData `thrift:"3,optional"`

	// File offset of ColumnChunk's OffsetIndex.
	OffsetIndexOffset int64 `thrift:"4,optional"`

	// Size of ColumnChunk's OffsetIndex, in bytes.
	OffsetIndexLength int32 `thrift:"5,optional"`

	// File offset of ColumnChunk's ColumnIndex.
	ColumnIndexOffset int64 `thrift:"6,optional"`

	// Size of ColumnChunk's ColumnIndex, in bytes.
	ColumnIndexLength int32 `thrift:"7,optional"`

	// Crypto metadata of encrypted columns.
	CryptoMetadata ColumnCryptoMetaData `thrift:"8,optional"`

	// Encrypted column metadata for this chunk.
	EncryptedColumnMetadata []byte `thrift:"9,optional"`
}
   793  
// RowGroup describes one row group of a file: its column chunks, its row
// count and byte sizes, and optionally its sort order, file offset and
// ordinal position in the file.
type RowGroup struct {
	// Metadata for each column chunk in this row group.
	// This list must have the same order as the SchemaElement list in FileMetaData.
	Columns []ColumnChunk `thrift:"1,required"`

	// Total byte size of all the uncompressed column data in this row group.
	TotalByteSize int64 `thrift:"2,required"`

	// Number of rows in this row group.
	NumRows int64 `thrift:"3,required"`

	// If set, specifies a sort ordering of the rows in this RowGroup.
	// The sorting columns can be a subset of all the columns.
	SortingColumns []SortingColumn `thrift:"4,optional"`

	// Byte offset from beginning of file to first page (data or dictionary)
	// in this row group.
	FileOffset int64 `thrift:"5,optional"`

	// Total byte size of all compressed (and potentially encrypted) column data
	// in this row group.
	TotalCompressedSize int64 `thrift:"6,optional"`

	// Row group ordinal in the file.
	Ordinal int16 `thrift:"7,optional"`
}
   820  
// TypeDefinedOrder is an empty struct used to signal the order defined by the
// physical or logical type. It is the only alternative of the ColumnOrder
// union declared in this file.
type TypeDefinedOrder struct{}
   823  
// ColumnOrder is a union specifying the order used for the min_value and
// max_value fields for a column. This union takes the role of an enhanced enum
// that allows rich elements (which will be needed for a collation-based
// ordering in the future).
//
// Possible values are:
//
//	TypeDefinedOrder - the column uses the order defined by its logical or
//	                   physical type (if there is no logical type).
//
// If the reader does not support the value of this union, min and max stats
// for this column should be ignored.
type ColumnOrder struct { // union
	// TypeOrder selects the type-defined order described below.
	//
	// The sort orders for logical types are:
	//   UTF8 - unsigned byte-wise comparison
	//   INT8 - signed comparison
	//   INT16 - signed comparison
	//   INT32 - signed comparison
	//   INT64 - signed comparison
	//   UINT8 - unsigned comparison
	//   UINT16 - unsigned comparison
	//   UINT32 - unsigned comparison
	//   UINT64 - unsigned comparison
	//   DECIMAL - signed comparison of the represented value
	//   DATE - signed comparison
	//   TIME_MILLIS - signed comparison
	//   TIME_MICROS - signed comparison
	//   TIMESTAMP_MILLIS - signed comparison
	//   TIMESTAMP_MICROS - signed comparison
	//   INTERVAL - unsigned comparison
	//   JSON - unsigned byte-wise comparison
	//   BSON - unsigned byte-wise comparison
	//   ENUM - unsigned byte-wise comparison
	//   LIST - undefined
	//   MAP - undefined
	//
	// In the absence of logical types, the sort order is determined by the physical type:
	//   BOOLEAN - false, true
	//   INT32 - signed comparison
	//   INT64 - signed comparison
	//   INT96 (only used for legacy timestamps) - undefined
	//   FLOAT - signed comparison of the represented value (*)
	//   DOUBLE - signed comparison of the represented value (*)
	//   BYTE_ARRAY - unsigned byte-wise comparison
	//   FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
	//
	// (*) Because the sorting order is not specified properly for floating
	//     point values (relations vs. total ordering) the following
	//     compatibility rules should be applied when reading statistics:
	//     - If the min is a NaN, it should be ignored.
	//     - If the max is a NaN, it should be ignored.
	//     - If the min is +0, the row group may contain -0 values as well.
	//     - If the max is -0, the row group may contain +0 values as well.
	//     - When looking for NaN values, min and max should be ignored.
	TypeOrder *TypeDefinedOrder `thrift:"1"`
}
   879  
// PageLocation records where a single page is stored in the file, how many
// bytes it occupies (header included), and the index of its first row within
// the row group. OffsetIndex holds a list of these entries.
type PageLocation struct {
	// Offset of the page in the file.
	Offset int64 `thrift:"1,required"`

	// Size of the page, including header. Sum of compressed_page_size and
	// header length.
	CompressedPageSize int32 `thrift:"2,required"`

	// Index within the RowGroup of the first row of the page; this means
	// pages change on record boundaries (r = 0).
	FirstRowIndex int64 `thrift:"3,required"`
}
   892  
// OffsetIndex lists the locations of the pages of a column chunk. Its position
// and size in the file are given by ColumnChunk.OffsetIndexOffset and
// ColumnChunk.OffsetIndexLength.
type OffsetIndex struct {
	// PageLocations, ordered by increasing PageLocation.offset. It is required
	// that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
	PageLocations []PageLocation `thrift:"1,required"`
}
   898  
// ColumnIndex holds per-page statistics for a column chunk.
// Each <array-field>[i] refers to the page at OffsetIndex.PageLocations[i].
type ColumnIndex struct {
	// A list of Boolean values to determine the validity of the corresponding
	// min and max values. If true, a page contains only null values, and writers
	// have to set the corresponding entries in min_values and max_values to
	// byte[0], so that all lists have the same length. If false, the
	// corresponding entries in min_values and max_values must be valid.
	NullPages []bool `thrift:"1,required"`

	// Two lists containing lower and upper bounds for the values of each page
	// determined by the ColumnOrder of the column. These may be the actual
	// minimum and maximum values found on a page, but can also be (more compact)
	// values that do not exist on a page. For example, instead of storing "Blart
	// Versenwald III", a writer may set min_values[i]="B", max_values[i]="C".
	// Such more compact values must still be valid values within the column's
	// logical type. Readers must make sure that list entries are populated before
	// using them by inspecting null_pages.
	MinValues [][]byte `thrift:"2,required"`
	// Upper bounds per page; see the comment on MinValues.
	MaxValues [][]byte `thrift:"3,required"`

	// Stores whether both min_values and max_values are ordered and if so, in
	// which direction. This allows readers to perform binary searches in both
	// lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
	// if the lists are ordered.
	BoundaryOrder BoundaryOrder `thrift:"4,required"`

	// A list containing the number of null values for each page.
	NullCounts []int64 `thrift:"5,optional"`
}
   929  
// AesGcmV1 carries the AAD-related parameters for the first alternative
// (thrift field 1) of the EncryptionAlgorithm union. All fields are optional.
//
// NOTE(review): the name suggests the AES_GCM_V1 algorithm of the Parquet
// encryption specification; confirm there.
type AesGcmV1 struct {
	// AAD prefix.
	AadPrefix []byte `thrift:"1,optional"`

	// Unique file identifier part of AAD suffix.
	AadFileUnique []byte `thrift:"2,optional"`

	// In files encrypted with AAD prefix without storing it,
	// readers must supply the prefix.
	SupplyAadPrefix bool `thrift:"3,optional"`
}
   941  
// AesGcmCtrV1 carries the AAD-related parameters for the second alternative
// (thrift field 2) of the EncryptionAlgorithm union. Its fields mirror those of
// AesGcmV1 and are all optional.
//
// NOTE(review): the name suggests the AES_GCM_CTR_V1 algorithm of the Parquet
// encryption specification; confirm there.
type AesGcmCtrV1 struct {
	// AAD prefix.
	AadPrefix []byte `thrift:"1,optional"`

	// Unique file identifier part of AAD suffix.
	AadFileUnique []byte `thrift:"2,optional"`

	// In files encrypted with AAD prefix without storing it,
	// readers must supply the prefix.
	SupplyAadPrefix bool `thrift:"3,optional"`
}
   953  
// EncryptionAlgorithm is a union identifying the encryption algorithm of a
// file together with that algorithm's parameters. It is stored in FileMetaData
// for files with a plaintext footer and in FileCryptoMetaData for files with
// an encrypted footer.
type EncryptionAlgorithm struct { // union
	// Parameters for the AesGcmV1 alternative.
	AesGcmV1 *AesGcmV1 `thrift:"1"`
	// Parameters for the AesGcmCtrV1 alternative.
	AesGcmCtrV1 *AesGcmCtrV1 `thrift:"2"`
}
   958  
// FileMetaData is the description of a file's metadata: format version,
// schema, row count, row groups, and optional key/value, column-order and
// encryption information.
type FileMetaData struct {
	// Version of this file.
	Version int32 `thrift:"1,required"`

	// Parquet schema for this file.  This schema contains metadata for all the columns.
	// The schema is represented as a tree with a single root.  The nodes of the tree
	// are flattened to a list by doing a depth-first traversal.
	// The column metadata contains the path in the schema for that column which can be
	// used to map columns to nodes in the schema.
	// The first element is the root.
	Schema []SchemaElement `thrift:"2,required"`

	// Number of rows in this file.
	NumRows int64 `thrift:"3,required"`

	// Row groups in this file.
	RowGroups []RowGroup `thrift:"4,required"`

	// Optional key/value metadata.
	KeyValueMetadata []KeyValue `thrift:"5,optional"`

	// String for application that wrote this file.  This should be in the format
	// <Application> version <App Version> (build <App Build Hash>).
	// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
	CreatedBy string `thrift:"6,optional"`

	// Sort order used for the min_value and max_value fields in the Statistics
	// objects and the min_values and max_values fields in the ColumnIndex
	// objects of each column in this file. Sort orders are listed in the order
	// matching the columns in the schema. The indexes are not necessarily the
	// same though, because only leaf nodes of the schema are represented in the
	// list of sort orders.
	//
	// Without column_orders, the meaning of the min_value and max_value fields
	// in the Statistics object and the ColumnIndex object is undefined. To ensure
	// well-defined behavior, if these fields are written to a Parquet file,
	// column_orders must be written as well.
	//
	// The obsolete min and max fields in the Statistics object are always sorted
	// by signed comparison regardless of column_orders.
	ColumnOrders []ColumnOrder `thrift:"7,optional"`

	// Encryption algorithm. This field is set only in encrypted files
	// with plaintext footer. Files with encrypted footer store algorithm id
	// in FileCryptoMetaData structure.
	EncryptionAlgorithm EncryptionAlgorithm `thrift:"8,optional"`

	// Retrieval metadata of key used for signing the footer.
	// Used only in encrypted files with plaintext footer.
	FooterSigningKeyMetadata []byte `thrift:"9,optional"`
}
  1011  
// FileCryptoMetaData is the crypto metadata for files with encrypted footer:
// the encryption algorithm (required) and optional retrieval metadata for the
// key.
type FileCryptoMetaData struct {
	// Encryption algorithm. This field is only used for files
	// with encrypted footer. Files with plaintext footer store algorithm id
	// inside footer (FileMetaData structure).
	EncryptionAlgorithm EncryptionAlgorithm `thrift:"1,required"`

	// Retrieval metadata of key used for encryption of footer,
	// and (possibly) columns.
	KeyMetadata []byte `thrift:"2,optional"`
}