github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/format/parquet.go (about)

     1  package format
     2  
     3  import (
     4  	"fmt"
     5  
     6  	"github.com/parquet-go/parquet-go/deprecated"
     7  )
     8  
     9  // Types supported by Parquet. These types are intended to be used in combination
    10  // with the encodings to control the on disk storage format. For example INT16
    11  // is not included as a type since a good encoding of INT32 would handle this.
    12  type Type int32
    13  
    14  const (
    15  	Boolean           Type = 0
    16  	Int32             Type = 1
    17  	Int64             Type = 2
    18  	Int96             Type = 3 // deprecated, only used by legacy implementations.
    19  	Float             Type = 4
    20  	Double            Type = 5
    21  	ByteArray         Type = 6
    22  	FixedLenByteArray Type = 7
    23  )
    24  
    25  func (t Type) String() string {
    26  	switch t {
    27  	case Boolean:
    28  		return "BOOLEAN"
    29  	case Int32:
    30  		return "INT32"
    31  	case Int64:
    32  		return "INT64"
    33  	case Int96:
    34  		return "INT96"
    35  	case Float:
    36  		return "FLOAT"
    37  	case Double:
    38  		return "DOUBLE"
    39  	case ByteArray:
    40  		return "BYTE_ARRAY"
    41  	case FixedLenByteArray:
    42  		return "FIXED_LEN_BYTE_ARRAY"
    43  	default:
    44  		return "Type(?)"
    45  	}
    46  }
    47  
    48  // Representation of Schemas.
    49  type FieldRepetitionType int32
    50  
    51  const (
    52  	// The field is required (can not be null) and each record has exactly 1 value.
    53  	Required FieldRepetitionType = 0
    54  	// The field is optional (can be null) and each record has 0 or 1 values.
    55  	Optional FieldRepetitionType = 1
    56  	// The field is repeated and can contain 0 or more values.
    57  	Repeated FieldRepetitionType = 2
    58  )
    59  
    60  func (t FieldRepetitionType) String() string {
    61  	switch t {
    62  	case Required:
    63  		return "REQUIRED"
    64  	case Optional:
    65  		return "OPTIONAL"
    66  	case Repeated:
    67  		return "REPEATED"
    68  	default:
    69  		return "FieldRepeationaType(?)"
    70  	}
    71  }
    72  
    73  // Statistics per row group and per page.
    74  // All fields are optional.
    75  type Statistics struct {
    76  	// DEPRECATED: min and max value of the column. Use min_value and max_value.
    77  	//
    78  	// Values are encoded using PLAIN encoding, except that variable-length byte
    79  	// arrays do not include a length prefix.
    80  	//
    81  	// These fields encode min and max values determined by signed comparison
    82  	// only. New files should use the correct order for a column's logical type
    83  	// and store the values in the min_value and max_value fields.
    84  	//
    85  	// To support older readers, these may be set when the column order is
    86  	// signed.
    87  	Max []byte `thrift:"1"`
    88  	Min []byte `thrift:"2"`
    89  	// Count of null value in the column.
    90  	NullCount int64 `thrift:"3"`
    91  	// Count of distinct values occurring.
    92  	DistinctCount int64 `thrift:"4"`
    93  	// Min and max values for the column, determined by its ColumnOrder.
    94  	//
    95  	// Values are encoded using PLAIN encoding, except that variable-length byte
    96  	// arrays do not include a length prefix.
    97  	MaxValue []byte `thrift:"5"`
    98  	MinValue []byte `thrift:"6"`
    99  }
   100  
   101  // Empty structs to use as logical type annotations.
   102  type StringType struct{} // allowed for BINARY, must be encoded with UTF-8
   103  type UUIDType struct{}   // allowed for FIXED[16], must encode raw UUID bytes
   104  type MapType struct{}    // see see LogicalTypes.md
   105  type ListType struct{}   // see LogicalTypes.md
   106  type EnumType struct{}   // allowed for BINARY, must be encoded with UTF-8
   107  type DateType struct{}   // allowed for INT32
   108  
   109  func (*StringType) String() string { return "STRING" }
   110  func (*UUIDType) String() string   { return "UUID" }
   111  func (*MapType) String() string    { return "MAP" }
   112  func (*ListType) String() string   { return "LIST" }
   113  func (*EnumType) String() string   { return "ENUM" }
   114  func (*DateType) String() string   { return "DATE" }
   115  
   116  // Logical type to annotate a column that is always null.
   117  //
   118  // Sometimes when discovering the schema of existing data, values are always
   119  // null and the physical type can't be determined. This annotation signals
   120  // the case where the physical type was guessed from all null values.
   121  type NullType struct{}
   122  
   123  func (*NullType) String() string { return "NULL" }
   124  
   125  // Decimal logical type annotation
   126  //
   127  // To maintain forward-compatibility in v1, implementations using this logical
   128  // type must also set scale and precision on the annotated SchemaElement.
   129  //
   130  // Allowed for physical types: INT32, INT64, FIXED, and BINARY
   131  type DecimalType struct {
   132  	Scale     int32 `thrift:"1,required"`
   133  	Precision int32 `thrift:"2,required"`
   134  }
   135  
   136  func (t *DecimalType) String() string {
   137  	// Matching parquet-cli's decimal string format: https://github.com/apache/parquet-mr/blob/d057b39d93014fe40f5067ee4a33621e65c91552/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java#L249-L265
   138  	return fmt.Sprintf("DECIMAL(%d,%d)", t.Precision, t.Scale)
   139  }
   140  
   141  // Time units for logical types.
   142  type MilliSeconds struct{}
   143  type MicroSeconds struct{}
   144  type NanoSeconds struct{}
   145  
   146  func (*MilliSeconds) String() string { return "MILLIS" }
   147  func (*MicroSeconds) String() string { return "MICROS" }
   148  func (*NanoSeconds) String() string  { return "NANOS" }
   149  
   150  type TimeUnit struct { // union
   151  	Millis *MilliSeconds `thrift:"1"`
   152  	Micros *MicroSeconds `thrift:"2"`
   153  	Nanos  *NanoSeconds  `thrift:"3"`
   154  }
   155  
   156  func (u *TimeUnit) String() string {
   157  	switch {
   158  	case u.Millis != nil:
   159  		return u.Millis.String()
   160  	case u.Micros != nil:
   161  		return u.Micros.String()
   162  	case u.Nanos != nil:
   163  		return u.Nanos.String()
   164  	default:
   165  		return ""
   166  	}
   167  }
   168  
   169  // Timestamp logical type annotation
   170  //
   171  // Allowed for physical types: INT64
   172  type TimestampType struct {
   173  	IsAdjustedToUTC bool     `thrift:"1,required"`
   174  	Unit            TimeUnit `thrift:"2,required"`
   175  }
   176  
   177  func (t *TimestampType) String() string {
   178  	return fmt.Sprintf("TIMESTAMP(isAdjustedToUTC=%t,unit=%s)", t.IsAdjustedToUTC, &t.Unit)
   179  }
   180  
   181  // Time logical type annotation
   182  //
   183  // Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
   184  type TimeType struct {
   185  	IsAdjustedToUTC bool     `thrift:"1,required"`
   186  	Unit            TimeUnit `thrift:"2,required"`
   187  }
   188  
   189  func (t *TimeType) String() string {
   190  	return fmt.Sprintf("TIME(isAdjustedToUTC=%t,unit=%s)", t.IsAdjustedToUTC, &t.Unit)
   191  }
   192  
   193  // Integer logical type annotation
   194  //
   195  // bitWidth must be 8, 16, 32, or 64.
   196  //
   197  // Allowed for physical types: INT32, INT64
   198  type IntType struct {
   199  	BitWidth int8 `thrift:"1,required"`
   200  	IsSigned bool `thrift:"2,required"`
   201  }
   202  
   203  func (t *IntType) String() string {
   204  	return fmt.Sprintf("INT(%d,%t)", t.BitWidth, t.IsSigned)
   205  }
   206  
   207  // Embedded JSON logical type annotation
   208  //
   209  // Allowed for physical types: BINARY
   210  type JsonType struct{}
   211  
   212  func (t *JsonType) String() string { return "JSON" }
   213  
   214  // Embedded BSON logical type annotation
   215  //
   216  // Allowed for physical types: BINARY
   217  type BsonType struct{}
   218  
   219  func (t *BsonType) String() string { return "BSON" }
   220  
   221  // LogicalType annotations to replace ConvertedType.
   222  //
   223  // To maintain compatibility, implementations using LogicalType for a
   224  // SchemaElement must also set the corresponding ConvertedType (if any)
   225  // from the following table.
   226  type LogicalType struct { // union
   227  	UTF8    *StringType  `thrift:"1"` // use ConvertedType UTF8
   228  	Map     *MapType     `thrift:"2"` // use ConvertedType Map
   229  	List    *ListType    `thrift:"3"` // use ConvertedType List
   230  	Enum    *EnumType    `thrift:"4"` // use ConvertedType Enum
   231  	Decimal *DecimalType `thrift:"5"` // use ConvertedType Decimal + SchemaElement.{Scale, Precision}
   232  	Date    *DateType    `thrift:"6"` // use ConvertedType Date
   233  
   234  	// use ConvertedType TimeMicros for Time{IsAdjustedToUTC: *, Unit: Micros}
   235  	// use ConvertedType TimeMillis for Time{IsAdjustedToUTC: *, Unit: Millis}
   236  	Time *TimeType `thrift:"7"`
   237  
   238  	// use ConvertedType TimestampMicros for Timestamp{IsAdjustedToUTC: *, Unit: Micros}
   239  	// use ConvertedType TimestampMillis for Timestamp{IsAdjustedToUTC: *, Unit: Millis}
   240  	Timestamp *TimestampType `thrift:"8"`
   241  
   242  	// 9: reserved for Interval
   243  	Integer *IntType  `thrift:"10"` // use ConvertedType Int* or Uint*
   244  	Unknown *NullType `thrift:"11"` // no compatible ConvertedType
   245  	Json    *JsonType `thrift:"12"` // use ConvertedType JSON
   246  	Bson    *BsonType `thrift:"13"` // use ConvertedType BSON
   247  	UUID    *UUIDType `thrift:"14"` // no compatible ConvertedType
   248  }
   249  
   250  func (t *LogicalType) String() string {
   251  	switch {
   252  	case t.UTF8 != nil:
   253  		return t.UTF8.String()
   254  	case t.Map != nil:
   255  		return t.Map.String()
   256  	case t.List != nil:
   257  		return t.List.String()
   258  	case t.Enum != nil:
   259  		return t.Enum.String()
   260  	case t.Decimal != nil:
   261  		return t.Decimal.String()
   262  	case t.Date != nil:
   263  		return t.Date.String()
   264  	case t.Time != nil:
   265  		return t.Time.String()
   266  	case t.Timestamp != nil:
   267  		return t.Timestamp.String()
   268  	case t.Integer != nil:
   269  		return t.Integer.String()
   270  	case t.Unknown != nil:
   271  		return t.Unknown.String()
   272  	case t.Json != nil:
   273  		return t.Json.String()
   274  	case t.Bson != nil:
   275  		return t.Bson.String()
   276  	case t.UUID != nil:
   277  		return t.UUID.String()
   278  	default:
   279  		return ""
   280  	}
   281  }
   282  
   283  // Represents a element inside a schema definition.
   284  //
   285  //   - if it is a group (inner node) then type is undefined and num_children is
   286  //     defined
   287  //
   288  //   - if it is a primitive type (leaf) then type is defined and num_children is
   289  //     undefined
   290  //
   291  // The nodes are listed in depth first traversal order.
   292  type SchemaElement struct {
   293  	// Data type for this field. Not set if the current element is a non-leaf node.
   294  	Type *Type `thrift:"1,optional"`
   295  
   296  	// If type is FixedLenByteArray, this is the byte length of the values.
   297  	// Otherwise, if specified, this is the maximum bit length to store any of the values.
   298  	// (e.g. a low cardinality INT col could have this set to 3).  Note that this is
   299  	// in the schema, and therefore fixed for the entire file.
   300  	TypeLength *int32 `thrift:"2,optional"`
   301  
   302  	// repetition of the field. The root of the schema does not have a repetition_type.
   303  	// All other nodes must have one.
   304  	RepetitionType *FieldRepetitionType `thrift:"3,optional"`
   305  
   306  	// Name of the field in the schema.
   307  	Name string `thrift:"4,required"`
   308  
   309  	// Nested fields.  Since thrift does not support nested fields,
   310  	// the nesting is flattened to a single list by a depth-first traversal.
   311  	// The children count is used to construct the nested relationship.
   312  	// This field is not set when the element is a primitive type
   313  	NumChildren int32 `thrift:"5,optional"`
   314  
   315  	// DEPRECATED: When the schema is the result of a conversion from another model.
   316  	// Used to record the original type to help with cross conversion.
   317  	//
   318  	// This is superseded by logicalType.
   319  	ConvertedType *deprecated.ConvertedType `thrift:"6,optional"`
   320  
   321  	// DEPRECATED: Used when this column contains decimal data.
   322  	// See the DECIMAL converted type for more details.
   323  	//
   324  	// This is superseded by using the DecimalType annotation in logicalType.
   325  	Scale     *int32 `thrift:"7,optional"`
   326  	Precision *int32 `thrift:"8,optional"`
   327  
   328  	// When the original schema supports field ids, this will save the
   329  	// original field id in the parquet schema.
   330  	FieldID int32 `thrift:"9,optional"`
   331  
   332  	// The logical type of this SchemaElement
   333  	//
   334  	// LogicalType replaces ConvertedType, but ConvertedType is still required
   335  	// for some logical types to ensure forward-compatibility in format v1.
   336  	LogicalType *LogicalType `thrift:"10,optional"`
   337  }
   338  
   339  // Encodings supported by Parquet. Not all encodings are valid for all types.
   340  // These enums are also used to specify the encoding of definition and
   341  // repetition levels. See the accompanying doc for the details of the more
   342  // complicated encodings.
   343  type Encoding int32
   344  
   345  const (
   346  	// Default encoding.
   347  	// Boolean - 1 bit per value. 0 is false; 1 is true.
   348  	// Int32 - 4 bytes per value. Stored as little-endian.
   349  	// Int64 - 8 bytes per value. Stored as little-endian.
   350  	// Float - 4 bytes per value. IEEE. Stored as little-endian.
   351  	// Double - 8 bytes per value. IEEE. Stored as little-endian.
   352  	// ByteArray - 4 byte length stored as little endian, followed by bytes.
   353  	// FixedLenByteArray - Just the bytes.
   354  	Plain Encoding = 0
   355  
   356  	// Group VarInt encoding for Int32/Int64.
   357  	// This encoding is deprecated. It was never used.
   358  	// GroupVarInt Encoding = 1
   359  
   360  	// Deprecated: Dictionary encoding. The values in the dictionary are encoded
   361  	// in the plain type.
   362  	// In a data page use RLEDictionary instead.
   363  	// In a Dictionary page use Plain instead.
   364  	PlainDictionary Encoding = 2
   365  
   366  	// Group packed run length encoding. Usable for definition/repetition levels
   367  	// encoding and Booleans (on one bit: 0 is false 1 is true.)
   368  	RLE Encoding = 3
   369  
   370  	// Bit packed encoding. This can only be used if the data has a known max
   371  	// width. Usable for definition/repetition levels encoding.
   372  	BitPacked Encoding = 4
   373  
   374  	// Delta encoding for integers. This can be used for int columns and works best
   375  	// on sorted data.
   376  	DeltaBinaryPacked Encoding = 5
   377  
   378  	// Encoding for byte arrays to separate the length values and the data.
   379  	// The lengths are encoded using DeltaBinaryPacked.
   380  	DeltaLengthByteArray Encoding = 6
   381  
   382  	// Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED.
   383  	// Suffixes are stored as delta length byte arrays.
   384  	DeltaByteArray Encoding = 7
   385  
   386  	// Dictionary encoding: the ids are encoded using the RLE encoding
   387  	RLEDictionary Encoding = 8
   388  
   389  	// Encoding for floating-point data.
   390  	// K byte-streams are created where K is the size in bytes of the data type.
   391  	// The individual bytes of an FP value are scattered to the corresponding stream and
   392  	// the streams are concatenated.
   393  	// This itself does not reduce the size of the data but can lead to better compression
   394  	// afterwards.
   395  	ByteStreamSplit Encoding = 9
   396  )
   397  
   398  func (e Encoding) String() string {
   399  	switch e {
   400  	case Plain:
   401  		return "PLAIN"
   402  	case PlainDictionary:
   403  		return "PLAIN_DICTIONARY"
   404  	case RLE:
   405  		return "RLE"
   406  	case BitPacked:
   407  		return "BIT_PACKED"
   408  	case DeltaBinaryPacked:
   409  		return "DELTA_BINARY_PACKED"
   410  	case DeltaLengthByteArray:
   411  		return "DELTA_LENGTH_BYTE_ARRAY"
   412  	case DeltaByteArray:
   413  		return "DELTA_BYTE_ARRAY"
   414  	case RLEDictionary:
   415  		return "RLE_DICTIONARY"
   416  	case ByteStreamSplit:
   417  		return "BYTE_STREAM_SPLIT"
   418  	default:
   419  		return "Encoding(?)"
   420  	}
   421  }
   422  
   423  // Supported compression algorithms.
   424  //
   425  // Codecs added in format version X.Y can be read by readers based on X.Y and later.
   426  // Codec support may vary between readers based on the format version and
   427  // libraries available at runtime.
   428  //
   429  // See Compression.md for a detailed specification of these algorithms.
   430  type CompressionCodec int32
   431  
   432  const (
   433  	Uncompressed CompressionCodec = 0
   434  	Snappy       CompressionCodec = 1
   435  	Gzip         CompressionCodec = 2
   436  	LZO          CompressionCodec = 3
   437  	Brotli       CompressionCodec = 4 // Added in 2.4
   438  	Lz4          CompressionCodec = 5 // DEPRECATED (Added in 2.4)
   439  	Zstd         CompressionCodec = 6 // Added in 2.4
   440  	Lz4Raw       CompressionCodec = 7 // Added in 2.9
   441  )
   442  
   443  func (c CompressionCodec) String() string {
   444  	switch c {
   445  	case Uncompressed:
   446  		return "UNCOMPRESSED"
   447  	case Snappy:
   448  		return "SNAPPY"
   449  	case Gzip:
   450  		return "GZIP"
   451  	case LZO:
   452  		return "LZO"
   453  	case Brotli:
   454  		return "BROTLI"
   455  	case Lz4:
   456  		return "LZ4"
   457  	case Zstd:
   458  		return "ZSTD"
   459  	case Lz4Raw:
   460  		return "LZ4_RAW"
   461  	default:
   462  		return "CompressionCodec(?)"
   463  	}
   464  }
   465  
   466  type PageType int32
   467  
   468  const (
   469  	DataPage       PageType = 0
   470  	IndexPage      PageType = 1
   471  	DictionaryPage PageType = 2
   472  	// Version 2 is indicated in the PageHeader and the use of DataPageHeaderV2,
   473  	// and allows you to read repetition and definition level data without
   474  	// decompressing the Page.
   475  	DataPageV2 PageType = 3
   476  )
   477  
   478  func (p PageType) String() string {
   479  	switch p {
   480  	case DataPage:
   481  		return "DATA_PAGE"
   482  	case IndexPage:
   483  		return "INDEX_PAGE"
   484  	case DictionaryPage:
   485  		return "DICTIONARY_PAGE"
   486  	case DataPageV2:
   487  		return "DATA_PAGE_V2"
   488  	default:
   489  		return "PageType(?)"
   490  	}
   491  }
   492  
   493  // Enum to annotate whether lists of min/max elements inside ColumnIndex
   494  // are ordered and if so, in which direction.
   495  type BoundaryOrder int32
   496  
   497  const (
   498  	Unordered  BoundaryOrder = 0
   499  	Ascending  BoundaryOrder = 1
   500  	Descending BoundaryOrder = 2
   501  )
   502  
   503  func (b BoundaryOrder) String() string {
   504  	switch b {
   505  	case Unordered:
   506  		return "UNORDERED"
   507  	case Ascending:
   508  		return "ASCENDING"
   509  	case Descending:
   510  		return "DESCENDING"
   511  	default:
   512  		return "BoundaryOrder(?)"
   513  	}
   514  }
   515  
   516  // Data page header.
   517  type DataPageHeader struct {
   518  	// Number of values, including NULLs, in this data page.
   519  	NumValues int32 `thrift:"1,required"`
   520  
   521  	// Encoding used for this data page.
   522  	Encoding Encoding `thrift:"2,required"`
   523  
   524  	// Encoding used for definition levels.
   525  	DefinitionLevelEncoding Encoding `thrift:"3,required"`
   526  
   527  	// Encoding used for repetition levels.
   528  	RepetitionLevelEncoding Encoding `thrift:"4,required"`
   529  
   530  	// Optional statistics for the data in this page.
   531  	Statistics Statistics `thrift:"5,optional"`
   532  }
   533  
   534  type IndexPageHeader struct {
   535  	// TODO
   536  }
   537  
   538  // The dictionary page must be placed at the first position of the column chunk
   539  // if it is partly or completely dictionary encoded. At most one dictionary page
   540  // can be placed in a column chunk.
   541  type DictionaryPageHeader struct {
   542  	// Number of values in the dictionary.
   543  	NumValues int32 `thrift:"1,required"`
   544  
   545  	// Encoding using this dictionary page.
   546  	Encoding Encoding `thrift:"2,required"`
   547  
   548  	// If true, the entries in the dictionary are sorted in ascending order.
   549  	IsSorted bool `thrift:"3,optional"`
   550  }
   551  
   552  // New page format allowing reading levels without decompressing the data
   553  // Repetition and definition levels are uncompressed
   554  // The remaining section containing the data is compressed if is_compressed is
   555  // true.
   556  type DataPageHeaderV2 struct {
   557  	// Number of values, including NULLs, in this data page.
   558  	NumValues int32 `thrift:"1,required"`
   559  	// Number of NULL values, in this data page.
   560  	// Number of non-null = num_values - num_nulls which is also the number of
   561  	// values in the data section.
   562  	NumNulls int32 `thrift:"2,required"`
   563  	// Number of rows in this data page. which means pages change on record boundaries (r = 0).
   564  	NumRows int32 `thrift:"3,required"`
   565  	// Encoding used for data in this page.
   566  	Encoding Encoding `thrift:"4,required"`
   567  
   568  	// Repetition levels and definition levels are always using RLE (without size in it).
   569  
   570  	// Length of the definition levels.
   571  	DefinitionLevelsByteLength int32 `thrift:"5,required"`
   572  	// Length of the repetition levels.
   573  	RepetitionLevelsByteLength int32 `thrift:"6,required"`
   574  
   575  	// Whether the values are compressed.
   576  	// Which means the section of the page between
   577  	// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
   578  	// is compressed with the compression_codec.
   579  	// If missing it is considered compressed.
   580  	IsCompressed *bool `thrift:"7,optional"`
   581  
   582  	// Optional statistics for the data in this page.
   583  	Statistics Statistics `thrift:"8,optional"`
   584  }
   585  
   586  // Block-based algorithm type annotation.
   587  type SplitBlockAlgorithm struct{}
   588  
   589  // The algorithm used in Bloom filter.
   590  type BloomFilterAlgorithm struct { // union
   591  	Block *SplitBlockAlgorithm `thrift:"1"`
   592  }
   593  
   594  // Hash strategy type annotation. xxHash is an extremely fast non-cryptographic
   595  // hash algorithm. It uses 64 bits version of xxHash.
   596  type XxHash struct{}
   597  
   598  // The hash function used in Bloom filter. This function takes the hash of a
   599  // column value using plain encoding.
   600  type BloomFilterHash struct { // union
   601  	XxHash *XxHash `thrift:"1"`
   602  }
   603  
   604  // The compression used in the Bloom filter.
   605  type BloomFilterUncompressed struct{}
   606  type BloomFilterCompression struct { // union
   607  	Uncompressed *BloomFilterUncompressed `thrift:"1"`
   608  }
   609  
   610  // Bloom filter header is stored at beginning of Bloom filter data of each column
   611  // and followed by its bitset.
   612  type BloomFilterHeader struct {
   613  	// The size of bitset in bytes.
   614  	NumBytes int32 `thrift:"1,required"`
   615  	// The algorithm for setting bits.
   616  	Algorithm BloomFilterAlgorithm `thrift:"2,required"`
   617  	// The hash function used for Bloom filter.
   618  	Hash BloomFilterHash `thrift:"3,required"`
   619  	// The compression used in the Bloom filter.
   620  	Compression BloomFilterCompression `thrift:"4,required"`
   621  }
   622  
   623  type PageHeader struct {
   624  	// The type of the page indicates which of the *Header fields below is set.
   625  	Type PageType `thrift:"1,required"`
   626  
   627  	// Uncompressed page size in bytes (not including this header).
   628  	UncompressedPageSize int32 `thrift:"2,required"`
   629  
   630  	// Compressed (and potentially encrypted) page size in bytes, not including
   631  	// this header.
   632  	CompressedPageSize int32 `thrift:"3,required"`
   633  
   634  	// The 32bit CRC for the page, to be be calculated as follows:
   635  	// - Using the standard CRC32 algorithm
   636  	// - On the data only, i.e. this header should not be included. 'Data'
   637  	//   hereby refers to the concatenation of the repetition levels, the
   638  	//   definition levels and the column value, in this exact order.
   639  	// - On the encoded versions of the repetition levels, definition levels and
   640  	//   column values.
   641  	// - On the compressed versions of the repetition levels, definition levels
   642  	//   and column values where possible;
   643  	//   - For v1 data pages, the repetition levels, definition levels and column
   644  	//     values are always compressed together. If a compression scheme is
   645  	//     specified, the CRC shall be calculated on the compressed version of
   646  	//     this concatenation. If no compression scheme is specified, the CRC
   647  	//     shall be calculated on the uncompressed version of this concatenation.
   648  	//   - For v2 data pages, the repetition levels and definition levels are
   649  	//     handled separately from the data and are never compressed (only
   650  	//     encoded). If a compression scheme is specified, the CRC shall be
   651  	//     calculated on the concatenation of the uncompressed repetition levels,
   652  	//     uncompressed definition levels and the compressed column values.
   653  	//     If no compression scheme is specified, the CRC shall be calculated on
   654  	//     the uncompressed concatenation.
   655  	// - In encrypted columns, CRC is calculated after page encryption; the
   656  	//   encryption itself is performed after page compression (if compressed)
   657  	// If enabled, this allows for disabling checksumming in HDFS if only a few
   658  	// pages need to be read.
   659  	CRC int32 `thrift:"4,optional"`
   660  
   661  	// Headers for page specific data. One only will be set.
   662  	DataPageHeader       *DataPageHeader       `thrift:"5,optional"`
   663  	IndexPageHeader      *IndexPageHeader      `thrift:"6,optional"`
   664  	DictionaryPageHeader *DictionaryPageHeader `thrift:"7,optional"`
   665  	DataPageHeaderV2     *DataPageHeaderV2     `thrift:"8,optional"`
   666  }
   667  
   668  // Wrapper struct to store key values.
   669  type KeyValue struct {
   670  	Key   string `thrift:"1,required"`
   671  	Value string `thrift:"2,required"`
   672  }
   673  
   674  // Wrapper struct to specify sort order.
   675  type SortingColumn struct {
   676  	// The column index (in this row group)
   677  	ColumnIdx int32 `thrift:"1,required"`
   678  
   679  	// If true, indicates this column is sorted in descending order.
   680  	Descending bool `thrift:"2,required"`
   681  
   682  	// If true, nulls will come before non-null values, otherwise,
   683  	// nulls go at the end.
   684  	NullsFirst bool `thrift:"3,required"`
   685  }
   686  
   687  // Statistics of a given page type and encoding.
   688  type PageEncodingStats struct {
   689  	// The page type (data/dic/...).
   690  	PageType PageType `thrift:"1,required"`
   691  
   692  	// Encoding of the page.
   693  	Encoding Encoding `thrift:"2,required"`
   694  
   695  	// Number of pages of this type with this encoding.
   696  	Count int32 `thrift:"3,required"`
   697  }
   698  
   699  // Description for column metadata.
   700  type ColumnMetaData struct {
   701  	// Type of this column.
   702  	Type Type `thrift:"1,required"`
   703  
   704  	// Set of all encodings used for this column. The purpose is to validate
   705  	// whether we can decode those pages.
   706  	Encoding []Encoding `thrift:"2,required"`
   707  
   708  	// Path in schema.
   709  	PathInSchema []string `thrift:"3,required"`
   710  
   711  	// Compression codec.
   712  	Codec CompressionCodec `thrift:"4,required"`
   713  
   714  	// Number of values in this column.
   715  	NumValues int64 `thrift:"5,required"`
   716  
   717  	// Total byte size of all uncompressed pages in this column chunk (including the headers).
   718  	TotalUncompressedSize int64 `thrift:"6,required"`
   719  
   720  	// Total byte size of all compressed, and potentially encrypted, pages
   721  	// in this column chunk (including the headers).
   722  	TotalCompressedSize int64 `thrift:"7,required"`
   723  
   724  	// Optional key/value metadata.
   725  	KeyValueMetadata []KeyValue `thrift:"8,optional"`
   726  
   727  	// Byte offset from beginning of file to first data page.
   728  	DataPageOffset int64 `thrift:"9,required"`
   729  
   730  	// Byte offset from beginning of file to root index page.
   731  	IndexPageOffset int64 `thrift:"10,optional"`
   732  
   733  	// Byte offset from the beginning of file to first (only) dictionary page.
   734  	DictionaryPageOffset int64 `thrift:"11,optional"`
   735  
   736  	// optional statistics for this column chunk.
   737  	Statistics Statistics `thrift:"12,optional"`
   738  
   739  	// Set of all encodings used for pages in this column chunk.
   740  	// This information can be used to determine if all data pages are
   741  	// dictionary encoded for example.
   742  	EncodingStats []PageEncodingStats `thrift:"13,optional"`
   743  
   744  	// Byte offset from beginning of file to Bloom filter data.
   745  	BloomFilterOffset int64 `thrift:"14,optional"`
   746  }
   747  
   748  type EncryptionWithFooterKey struct{}
   749  
   750  type EncryptionWithColumnKey struct {
   751  	// Column path in schema.
   752  	PathInSchema []string `thrift:"1,required"`
   753  
   754  	// Retrieval metadata of column encryption key.
   755  	KeyMetadata []byte `thrift:"2,optional"`
   756  }
   757  
   758  type ColumnCryptoMetaData struct {
   759  	EncryptionWithFooterKey *EncryptionWithFooterKey `thrift:"1"`
   760  	EncryptionWithColumnKey *EncryptionWithColumnKey `thrift:"2"`
   761  }
   762  
        // ColumnChunk describes one column chunk of a row group: where its
        // ColumnMetaData is stored, an optional inline copy of that metadata, the
        // location and size of its OffsetIndex and ColumnIndex, and the crypto
        // metadata used when the column is encrypted. RowGroup.Columns holds one
        // ColumnChunk per column.
   763  type ColumnChunk struct {
   764  	// File where column data is stored. If not set, assumed to be the same
   765  	// file as the metadata. This path is relative to the current file.
   766  	FilePath string `thrift:"1,optional"`
   767    
   768  	// Byte offset in file_path to the ColumnMetaData.
   769  	FileOffset int64 `thrift:"2,required"`
   770    
   771  	// Column metadata for this chunk. This is the same content as what is at
   772  	// file_path/file_offset. Keeping it here replicates it in the file
   773  	// metadata.
   774  	MetaData ColumnMetaData `thrift:"3,optional"`
   775    
   776  	// File offset of ColumnChunk's OffsetIndex.
   777  	OffsetIndexOffset int64 `thrift:"4,optional"`
   778    
   779  	// Size of ColumnChunk's OffsetIndex, in bytes.
   780  	OffsetIndexLength int32 `thrift:"5,optional"`
   781    
   782  	// File offset of ColumnChunk's ColumnIndex.
   783  	ColumnIndexOffset int64 `thrift:"6,optional"`
   784    
   785  	// Size of ColumnChunk's ColumnIndex, in bytes.
   786  	ColumnIndexLength int32 `thrift:"7,optional"`
   787    
   788  	// Crypto metadata of encrypted columns.
   789  	CryptoMetadata ColumnCryptoMetaData `thrift:"8,optional"`
   790    
   791  	// Encrypted column metadata for this chunk.
   792  	EncryptedColumnMetadata []byte `thrift:"9,optional"`
   793  }
   794  
        // RowGroup describes one row group of a file: its column chunks, its row
        // count, its uncompressed and compressed byte sizes, an optional sort
        // order, and its position in the file. FileMetaData.RowGroups lists the row
        // groups of a file.
   795  type RowGroup struct {
   796  	// Metadata for each column chunk in this row group.
   797  	// This list must have the same order as the SchemaElement list in FileMetaData.
   798  	Columns []ColumnChunk `thrift:"1,required"`
   799    
   800  	// Total byte size of all the uncompressed column data in this row group.
   801  	TotalByteSize int64 `thrift:"2,required"`
   802    
   803  	// Number of rows in this row group.
   804  	NumRows int64 `thrift:"3,required"`
   805    
   806  	// If set, specifies a sort ordering of the rows in this RowGroup.
   807  	// The sorting columns can be a subset of all the columns.
   808  	SortingColumns []SortingColumn `thrift:"4,optional"`
   809    
   810  	// Byte offset from beginning of file to first page (data or dictionary)
   811  	// in this row group.
   812  	FileOffset int64 `thrift:"5,optional"`
   813    
   814  	// Total byte size of all compressed (and potentially encrypted) column data
   815  	// in this row group.
   816  	TotalCompressedSize int64 `thrift:"6,optional"`
   817    
   818  	// Row group ordinal in the file.
   819  	Ordinal int16 `thrift:"7,optional"`
   820  }
   821  
   822  // TypeDefinedOrder is an empty struct that signals the order defined by the
        // physical or logical type. It is the only member of the ColumnOrder union.
   823  type TypeDefinedOrder struct{}
   824  
   825  // ColumnOrder is a union specifying the order used for the min_value and
   826  // max_value fields of a column. It takes the role of an enhanced enum that
   827  // allows rich elements (which will be needed for a collation-based ordering
        // in the future).
   828  //
   829  // Possible values are:
   830  //
   831  //	TypeDefinedOrder - the column uses the order defined by its logical or
   832  //	                   physical type (if there is no logical type).
   833  //
   834  // If the reader does not support the value of this union, min and max stats
   835  // for this column should be ignored.
   836  type ColumnOrder struct { // union
   837  	// The sort orders for logical types are:
   838  	//   UTF8 - unsigned byte-wise comparison
   839  	//   INT8 - signed comparison
   840  	//   INT16 - signed comparison
   841  	//   INT32 - signed comparison
   842  	//   INT64 - signed comparison
   843  	//   UINT8 - unsigned comparison
   844  	//   UINT16 - unsigned comparison
   845  	//   UINT32 - unsigned comparison
   846  	//   UINT64 - unsigned comparison
   847  	//   DECIMAL - signed comparison of the represented value
   848  	//   DATE - signed comparison
   849  	//   TIME_MILLIS - signed comparison
   850  	//   TIME_MICROS - signed comparison
   851  	//   TIMESTAMP_MILLIS - signed comparison
   852  	//   TIMESTAMP_MICROS - signed comparison
   853  	//   INTERVAL - unsigned comparison
   854  	//   JSON - unsigned byte-wise comparison
   855  	//   BSON - unsigned byte-wise comparison
   856  	//   ENUM - unsigned byte-wise comparison
   857  	//   LIST - undefined
   858  	//   MAP - undefined
   859  	//
   860  	// In the absence of logical types, the sort order is determined by the physical type:
   861  	//   BOOLEAN - false, true
   862  	//   INT32 - signed comparison
   863  	//   INT64 - signed comparison
   864  	//   INT96 (only used for legacy timestamps) - undefined
   865  	//   FLOAT - signed comparison of the represented value (*)
   866  	//   DOUBLE - signed comparison of the represented value (*)
   867  	//   BYTE_ARRAY - unsigned byte-wise comparison
   868  	//   FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
   869  	//
   870  	// (*) Because the sorting order is not specified properly for floating
   871  	//     point values (relations vs. total ordering) the following
   872  	//     compatibility rules should be applied when reading statistics:
   873  	//     - If the min is a NaN, it should be ignored.
   874  	//     - If the max is a NaN, it should be ignored.
   875  	//     - If the min is +0, the row group may contain -0 values as well.
   876  	//     - If the max is -0, the row group may contain +0 values as well.
   877  	//     - When looking for NaN values, min and max should be ignored.
   878  	TypeOrder *TypeDefinedOrder `thrift:"1"`
   879  }
   880  
        // PageLocation records where a single page is stored in the file, how many
        // bytes it occupies including its header, and the index of its first row
        // within the row group. OffsetIndex.PageLocations is a list of these.
   881  type PageLocation struct {
   882  	// Offset of the page in the file.
   883  	Offset int64 `thrift:"1,required"`
   884    
   885  	// Size of the page, including header. Sum of compressed_page_size and
   886  	// header length.
   887  	CompressedPageSize int32 `thrift:"2,required"`
   888    
   889  	// Index within the RowGroup of the first row of the page; this means
   890  	// pages change on record boundaries (r = 0).
   891  	FirstRowIndex int64 `thrift:"3,required"`
   892  }
   893  
        // OffsetIndex lists the location of every page of a column chunk. Its
        // position and size in the file are recorded in
        // ColumnChunk.OffsetIndexOffset and ColumnChunk.OffsetIndexLength.
   894  type OffsetIndex struct {
   895  	// PageLocations, ordered by increasing PageLocation.offset. It is required
   896  	// that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
   897  	PageLocations []PageLocation `thrift:"1,required"`
   898  }
   899  
   900  // ColumnIndex holds per-page null flags, min/max bounds and null counts.
   901  // Each <array-field>[i] refers to the page at OffsetIndex.PageLocations[i].
   902  type ColumnIndex struct {
   903  	// A list of Boolean values to determine the validity of the corresponding
   904  	// min and max values. If true, a page contains only null values, and writers
   905  	// have to set the corresponding entries in min_values and max_values to
   906  	// byte[0], so that all lists have the same length. If false, the
   907  	// corresponding entries in min_values and max_values must be valid.
   908  	NullPages []bool `thrift:"1,required"`
   909    
   910  	// Two lists containing lower and upper bounds for the values of each page
   911  	// determined by the ColumnOrder of the column. These may be the actual
   912  	// minimum and maximum values found on a page, but can also be (more compact)
   913  	// values that do not exist on a page. For example, instead of storing "Blart
   914  	// Versenwald III", a writer may set min_values[i]="B", max_values[i]="C".
   915  	// Such more compact values must still be valid values within the column's
   916  	// logical type. Readers must make sure that list entries are populated before
   917  	// using them by inspecting null_pages.
   918  	MinValues [][]byte `thrift:"2,required"`
   919  	MaxValues [][]byte `thrift:"3,required"`
   920    
   921  	// Stores whether both min_values and max_values are ordered and if so, in
   922  	// which direction. This allows readers to perform binary searches in both
   923  	// lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
   924  	// if the lists are ordered.
   925  	BoundaryOrder BoundaryOrder `thrift:"4,required"`
   926    
   927  	// A list containing the number of null values for each page.
   928  	NullCounts []int64 `thrift:"5,optional"`
   929  }
   930  
        // AesGcmV1 is the first alternative of the EncryptionAlgorithm union. It
        // carries an optional AAD prefix, the file-unique part of the AAD suffix,
        // and a flag telling readers that they must supply the AAD prefix
        // themselves. Its fields mirror those of AesGcmCtrV1.
        //
        // NOTE(review): the name suggests the AES-GCM cipher, with AAD standing for
        // "additional authenticated data"; neither is spelled out in this file, so
        // confirm against the Parquet encryption specification.
   931  type AesGcmV1 struct {
   932  	// AAD prefix.
   933  	AadPrefix []byte `thrift:"1,optional"`
   934    
   935  	// Unique file identifier part of AAD suffix.
   936  	AadFileUnique []byte `thrift:"2,optional"`
   937    
   938  	// In files encrypted with AAD prefix without storing it,
   939  	// readers must supply the prefix.
   940  	SupplyAadPrefix bool `thrift:"3,optional"`
   941  }
   942  
        // AesGcmCtrV1 is the second alternative of the EncryptionAlgorithm union. It
        // carries an optional AAD prefix, the file-unique part of the AAD suffix,
        // and a flag telling readers that they must supply the AAD prefix
        // themselves. Its fields mirror those of AesGcmV1.
        //
        // NOTE(review): the name suggests a combination of AES-GCM and AES-CTR;
        // which parts of a file use which mode is not stated in this file, so
        // confirm against the Parquet encryption specification.
   943  type AesGcmCtrV1 struct {
   944  	// AAD prefix.
   945  	AadPrefix []byte `thrift:"1,optional"`
   946    
   947  	// Unique file identifier part of AAD suffix.
   948  	AadFileUnique []byte `thrift:"2,optional"`
   949    
   950  	// In files encrypted with AAD prefix without storing it,
   951  	// readers must supply the prefix.
   952  	SupplyAadPrefix bool `thrift:"3,optional"`
   953  }
   954  
        // EncryptionAlgorithm is a union naming the encryption algorithm of a file
        // together with that algorithm's parameters. It is stored in FileMetaData
        // for files with a plaintext footer and in FileCryptoMetaData for files
        // with an encrypted footer.
   955  type EncryptionAlgorithm struct { // union
   956  	AesGcmV1    *AesGcmV1    `thrift:"1"`
   957  	AesGcmCtrV1 *AesGcmCtrV1 `thrift:"2"`
   958  }
   959  
   960  // FileMetaData is the description of a file's metadata: its version, schema,
        // row count, row groups, optional key/value metadata, writer identification,
        // column orders, and the encryption settings used with a plaintext footer.
   961  type FileMetaData struct {
   962  	// Version of this file.
   963  	Version int32 `thrift:"1,required"`
   964    
   965  	// Parquet schema for this file.  This schema contains metadata for all the columns.
   966  	// The schema is represented as a tree with a single root.  The nodes of the tree
   967  	// are flattened to a list by doing a depth-first traversal.
   968  	// The column metadata contains the path in the schema for that column which can be
   969  	// used to map columns to nodes in the schema.
   970  	// The first element is the root.
   971  	Schema []SchemaElement `thrift:"2,required"`
   972    
   973  	// Number of rows in this file.
   974  	NumRows int64 `thrift:"3,required"`
   975    
   976  	// Row groups in this file.
   977  	RowGroups []RowGroup `thrift:"4,required"`
   978    
   979  	// Optional key/value metadata.
   980  	KeyValueMetadata []KeyValue `thrift:"5,optional"`
   981    
   982  	// String for application that wrote this file.  This should be in the format
   983  	// <Application> version <App Version> (build <App Build Hash>).
   984  	// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
   985  	CreatedBy string `thrift:"6,optional"`
   986    
   987  	// Sort order used for the min_value and max_value fields in the Statistics
   988  	// objects and the min_values and max_values fields in the ColumnIndex
   989  	// objects of each column in this file. Sort orders are listed in the order
   990  	// matching the columns in the schema. The indexes are not necessarily the same
   991  	// though, because only leaf nodes of the schema are represented in the list
   992  	// of sort orders.
   993  	//
   994  	// Without column_orders, the meaning of the min_value and max_value fields
   995  	// in the Statistics object and the ColumnIndex object is undefined. To ensure
   996  	// well-defined behavior, if these fields are written to a Parquet file,
   997  	// column_orders must be written as well.
   998  	//
   999  	// The obsolete min and max fields in the Statistics object are always sorted
  1000  	// by signed comparison regardless of column_orders.
  1001  	ColumnOrders []ColumnOrder `thrift:"7,optional"`
  1002    
  1003  	// Encryption algorithm. This field is set only in encrypted files
  1004  	// with plaintext footer. Files with encrypted footer store algorithm id
  1005  	// in FileCryptoMetaData structure.
  1006  	EncryptionAlgorithm EncryptionAlgorithm `thrift:"8,optional"`
  1007    
  1008  	// Retrieval metadata of key used for signing the footer.
  1009  	// Used only in encrypted files with plaintext footer.
  1010  	FooterSigningKeyMetadata []byte `thrift:"9,optional"`
  1011  }
  1012  
  1013  // FileCryptoMetaData is the crypto metadata for files with an encrypted
        // footer: the encryption algorithm and, optionally, the metadata needed to
        // retrieve the key that encrypts the footer.
  1014  type FileCryptoMetaData struct {
  1015  	// Encryption algorithm. This field is only used for files
  1016  	// with encrypted footer. Files with plaintext footer store algorithm id
  1017  	// inside footer (FileMetaData structure).
  1018  	EncryptionAlgorithm EncryptionAlgorithm `thrift:"1,required"`
  1019    
  1020  	// Retrieval metadata of key used for encryption of footer,
  1021  	// and (possibly) columns.
  1022  	KeyMetadata []byte `thrift:"2,optional"`
  1023  }