storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/parquet.thrift (about)

     1  /**
     2   * Licensed to the Apache Software Foundation (ASF) under one
     3   * or more contributor license agreements.  See the NOTICE file
     4   * distributed with this work for additional information
     5   * regarding copyright ownership.  The ASF licenses this file
     6   * to you under the Apache License, Version 2.0 (the
     7   * "License"); you may not use this file except in compliance
     8   * with the License.  You may obtain a copy of the License at
     9   *
    10   *     http://www.apache.org/licenses/LICENSE-2.0
    11   *
    12   * Unless required by applicable law or agreed to in writing,
    13   * software distributed under the License is distributed on an
    14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    15   * KIND, either express or implied.  See the License for the
    16   * specific language governing permissions and limitations
    17   * under the License.
    18   */
    19  
    20  /**
    21   * File format description for the parquet file format
    22   */
    23  namespace cpp parquet
    24  namespace java org.apache.parquet.format
    25  
    26  /**
    27   * Types supported by Parquet.  These types are intended to be used in combination
    28   * with the encodings to control the on disk storage format.
    29   * For example INT16 is not included as a type since a good encoding of INT32
    30   * would handle this.
    31   */
    32  enum Type {
    33    BOOLEAN = 0;
    34    INT32 = 1;
    35    INT64 = 2;
    36    INT96 = 3;  // deprecated, only used by legacy implementations.
    37    FLOAT = 4;
    38    DOUBLE = 5;
    39    BYTE_ARRAY = 6;
    40    FIXED_LEN_BYTE_ARRAY = 7;
    41  }
    42  
    43  /**
    44   * Common types used by frameworks(e.g. hive, pig) using parquet.  This helps map
    45   * between types in those frameworks to the base types in parquet.  This is only
    46   * metadata and not needed to read or write the data.
    47   */
    48  enum ConvertedType {
    49    /** a BYTE_ARRAY actually contains UTF8 encoded chars */
    50    UTF8 = 0;
    51  
    52    /** a map is converted as an optional field containing a repeated key/value pair */
    53    MAP = 1;
    54  
    55    /** a key/value pair is converted into a group of two fields */
    56    MAP_KEY_VALUE = 2;
    57  
    58    /** a list is converted into an optional field containing a repeated field for its
    59     * values */
    60    LIST = 3;
    61  
    62    /** an enum is converted into a binary field */
    63    ENUM = 4;
    64  
    65    /**
    66     * A decimal value.
    67     *
    68     * This may be used to annotate binary or fixed primitive types. The
    69     * underlying byte array stores the unscaled value encoded as two's
    70     * complement using big-endian byte order (the most significant byte is the
    71     * zeroth element). The value of the decimal is the value * 10^{-scale}.
    72     *
    73     * This must be accompanied by a (maximum) precision and a scale in the
    74     * SchemaElement. The precision specifies the number of digits in the decimal
    75     * and the scale stores the location of the decimal point. For example 1.23
    76     * would have precision 3 (3 total digits) and scale 2 (the decimal point is
    77     * 2 digits over).
    78     */
    79    DECIMAL = 5;
    80  
    81    /**
    82     * A Date
    83     *
    84     * Stored as days since Unix epoch, encoded as the INT32 physical type.
    85     *
    86     */
    87    DATE = 6;
    88  
    89    /**
    90     * A time
    91     *
    92     * The total number of milliseconds since midnight.  The value is stored
    93     * as an INT32 physical type.
    94     */
    95    TIME_MILLIS = 7;
    96  
    97    /**
    98     * A time.
    99     *
   100     * The total number of microseconds since midnight.  The value is stored as
   101     * an INT64 physical type.
   102     */
   103    TIME_MICROS = 8;
   104  
   105    /**
   106     * A date/time combination
   107     *
   108     * Date and time recorded as milliseconds since the Unix epoch.  Recorded as
   109     * a physical type of INT64.
   110     */
   111    TIMESTAMP_MILLIS = 9;
   112  
   113    /**
   114     * A date/time combination
   115     *
   116     * Date and time recorded as microseconds since the Unix epoch.  The value is
   117     * stored as an INT64 physical type.
   118     */
   119    TIMESTAMP_MICROS = 10;
   120  
   121  
   122    /**
   123     * An unsigned integer value.
   124     *
   125     * The number describes the maximum number of meaningful data bits in
   126     * the stored value. 8, 16 and 32 bit values are stored using the
   127     * INT32 physical type.  64 bit values are stored using the INT64
   128     * physical type.
   129     *
   130     */
   131    UINT_8 = 11;
   132    UINT_16 = 12;
   133    UINT_32 = 13;
   134    UINT_64 = 14;
   135  
   136    /**
   137     * A signed integer value.
   138     *
   139     * The number describes the maximum number of meaningful data bits in
   140     * the stored value. 8, 16 and 32 bit values are stored using the
   141     * INT32 physical type.  64 bit values are stored using the INT64
   142     * physical type.
   143     *
   144     */
   145    INT_8 = 15;
   146    INT_16 = 16;
   147    INT_32 = 17;
   148    INT_64 = 18;
   149  
   150    /**
   151     * An embedded JSON document
   152     *
   153     * A JSON document embedded within a single UTF8 column.
   154     */
   155    JSON = 19;
   156  
   157    /**
   158     * An embedded BSON document
   159     *
   160     * A BSON document embedded within a single BINARY column.
   161     */
   162    BSON = 20;
   163  
   164    /**
   165     * An interval of time
   166     *
   167     * This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12
   168     * This data is composed of three separate little endian unsigned
   169     * integers.  Each stores a component of a duration of time.  The first
   170     * integer identifies the number of months associated with the duration,
   171     * the second identifies the number of days associated with the duration
   172     * and the third identifies the number of milliseconds associated with
   173     * the provided duration.  This duration of time is independent of any
   174     * particular timezone or date.
   175     */
   176    INTERVAL = 21;
   177  }
   178  
   179  /**
   180   * Representation of Schemas
   181   */
   182  enum FieldRepetitionType {
   183    /** This field is required (can not be null) and each record has exactly 1 value. */
   184    REQUIRED = 0;
   185  
   186    /** The field is optional (can be null) and each record has 0 or 1 values. */
   187    OPTIONAL = 1;
   188  
   189    /** The field is repeated and can contain 0 or more values */
   190    REPEATED = 2;
   191  }
   192  
   193  /**
   194   * Statistics per row group and per page
   195   * All fields are optional.
   196   */
   197  struct Statistics {
   198     /**
   199      * DEPRECATED: min and max value of the column. Use min_value and max_value.
   200      *
   201      * Values are encoded using PLAIN encoding, except that variable-length byte
   202      * arrays do not include a length prefix.
   203      *
   204      * These fields encode min and max values determined by signed comparison
   205      * only. New files should use the correct order for a column's logical type
   206      * and store the values in the min_value and max_value fields.
   207      *
   208      * To support older readers, these may be set when the column order is
   209      * signed.
   210      */
   211     1: optional binary max;
   212     2: optional binary min;
   213     /** count of null values in the column */
   214     3: optional i64 null_count;
   215     /** count of distinct values occurring */
   216     4: optional i64 distinct_count;
   217     /**
   218      * Min and max values for the column, determined by its ColumnOrder.
   219      *
   220      * Values are encoded using PLAIN encoding, except that variable-length byte
   221      * arrays do not include a length prefix.
   222      */
   223     5: optional binary max_value;
   224     6: optional binary min_value;
   225  }
   226  
   227  /** Empty structs to use as logical type annotations */
   228  struct StringType {}  // allowed for BINARY, must be encoded with UTF-8
   229  struct UUIDType {}    // allowed for FIXED[16], must be encoded as raw UUID bytes
   230  struct MapType {}     // see LogicalTypes.md
   231  struct ListType {}    // see LogicalTypes.md
   232  struct EnumType {}    // allowed for BINARY, must be encoded with UTF-8
   233  struct DateType {}    // allowed for INT32
   234  
   235  /**
   236   * Logical type to annotate a column that is always null.
   237   *
   238   * Sometimes when discovering the schema of existing data, values are always
   239   * null and the physical type can't be determined. This annotation signals
   240   * the case where the physical type was guessed from all null values.
   241   */
   242  struct NullType {}    // allowed for any physical type, only null values stored
   243  
   244  /**
   245   * Decimal logical type annotation
   246   *
   247   * To maintain forward-compatibility in v1, implementations using this logical
   248   * type must also set scale and precision on the annotated SchemaElement.
   249   *
   250   * Allowed for physical types: INT32, INT64, FIXED, and BINARY
   251   */
   252  struct DecimalType {
   253    1: required i32 scale
   254    2: required i32 precision
   255  }
   256  
   257  /** Time units for logical types */
   258  struct MilliSeconds {}
   259  struct MicroSeconds {}
   260  struct NanoSeconds {}
   261  union TimeUnit {
   262    1: MilliSeconds MILLIS
   263    2: MicroSeconds MICROS
   264    3: NanoSeconds NANOS
   265  }
   266  
   267  /**
   268   * Timestamp logical type annotation
   269   *
   270   * Allowed for physical types: INT64
   271   */
   272  struct TimestampType {
   273    1: required bool isAdjustedToUTC
   274    2: required TimeUnit unit
   275  }
   276  
   277  /**
   278   * Time logical type annotation
   279   *
   280   * Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
   281   */
   282  struct TimeType {
   283    1: required bool isAdjustedToUTC
   284    2: required TimeUnit unit
   285  }
   286  
   287  /**
   288   * Integer logical type annotation
   289   *
   290   * bitWidth must be 8, 16, 32, or 64.
   291   *
   292   * Allowed for physical types: INT32, INT64
   293   */
   294  struct IntType {
   295    1: required byte bitWidth
   296    2: required bool isSigned
   297  }
   298  
   299  /**
   300   * Embedded JSON logical type annotation
   301   *
   302   * Allowed for physical types: BINARY
   303   */
   304  struct JsonType {
   305  }
   306  
   307  /**
   308   * Embedded BSON logical type annotation
   309   *
   310   * Allowed for physical types: BINARY
   311   */
   312  struct BsonType {
   313  }
   314  
   315  /**
   316   * LogicalType annotations to replace ConvertedType.
   317   *
   318   * To maintain compatibility, implementations using LogicalType for a
   319   * SchemaElement must also set the corresponding ConvertedType from the
   320   * following table.
   321   */
   322  union LogicalType {
   323    1:  StringType STRING       // use ConvertedType UTF8
   324    2:  MapType MAP             // use ConvertedType MAP
   325    3:  ListType LIST           // use ConvertedType LIST
   326    4:  EnumType ENUM           // use ConvertedType ENUM
   327    5:  DecimalType DECIMAL     // use ConvertedType DECIMAL
   328    6:  DateType DATE           // use ConvertedType DATE
   329    7:  TimeType TIME           // use ConvertedType TIME_MICROS or TIME_MILLIS
   330    8:  TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS
   331    // 9: reserved for INTERVAL
   332    10: IntType INTEGER         // use ConvertedType INT_* or UINT_*
   333    11: NullType UNKNOWN        // no compatible ConvertedType
   334    12: JsonType JSON           // use ConvertedType JSON
   335    13: BsonType BSON           // use ConvertedType BSON
   336    14: UUIDType UUID           // no compatible ConvertedType
   337  }
   338  
   339  /**
   340   * Represents an element inside a schema definition.
   341   *  - if it is a group (inner node) then type is undefined and num_children is defined
   342   *  - if it is a primitive type (leaf) then type is defined and num_children is undefined
   343   * the nodes are listed in depth first traversal order.
   344   */
   345  struct SchemaElement {
   346    /** Data type for this field. Not set if the current element is a non-leaf node */
   347    1: optional Type type;
   348  
   349    /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
   350     * Otherwise, if specified, this is the maximum bit length to store any of the values.
   351     * (e.g. a low cardinality INT col could have this set to 3).  Note that this is
   352     * in the schema, and therefore fixed for the entire file.
   353     */
   354    2: optional i32 type_length;
   355  
   356    /** repetition of the field. The root of the schema does not have a repetition_type.
   357     * All other nodes must have one */
   358    3: optional FieldRepetitionType repetition_type;
   359  
   360    /** Name of the field in the schema */
   361    4: required string name;
   362  
   363    /** Nested fields.  Since thrift does not support nested fields,
   364     * the nesting is flattened to a single list by a depth-first traversal.
   365     * The children count is used to construct the nested relationship.
   366     * This field is not set when the element is a primitive type
   367     */
   368    5: optional i32 num_children;
   369  
   370    /** When the schema is the result of a conversion from another model
   371     * Used to record the original type to help with cross conversion.
   372     */
   373    6: optional ConvertedType converted_type;
   374  
   375    /** Used when this column contains decimal data.
   376     * See the DECIMAL converted type for more details.
   377     */
   378    7: optional i32 scale
   379    8: optional i32 precision
   380  
   381    /** When the original schema supports field ids, this will save the
   382     * original field id in the parquet schema
   383     */
   384    9: optional i32 field_id;
   385  
   386    /**
   387     * The logical type of this SchemaElement
   388     *
   389     * LogicalType replaces ConvertedType, but ConvertedType is still required
   390     * for some logical types to ensure forward-compatibility in format v1.
   391     */
   392    10: optional LogicalType logicalType
   393  }
   394  
   395  /**
   396   * Encodings supported by Parquet.  Not all encodings are valid for all types.  These
   397   * enums are also used to specify the encoding of definition and repetition levels.
   398   * See the accompanying doc for the details of the more complicated encodings.
   399   */
   400  enum Encoding {
   401    /** Default encoding.
   402     * BOOLEAN - 1 bit per value. 0 is false; 1 is true.
   403     * INT32 - 4 bytes per value.  Stored as little-endian.
   404     * INT64 - 8 bytes per value.  Stored as little-endian.
   405     * FLOAT - 4 bytes per value.  IEEE. Stored as little-endian.
   406     * DOUBLE - 8 bytes per value.  IEEE. Stored as little-endian.
   407     * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
   408     * FIXED_LEN_BYTE_ARRAY - Just the bytes.
   409     */
   410    PLAIN = 0;
   411  
   412    /** Group VarInt encoding for INT32/INT64.
   413     * This encoding is deprecated. It was never used
   414     */
   415    //  GROUP_VAR_INT = 1;
   416  
   417    /**
   418     * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
   419     * plain type.
   420     * in a data page use RLE_DICTIONARY instead.
   421     * in a Dictionary page use PLAIN instead
   422     */
   423    PLAIN_DICTIONARY = 2;
   424  
   425    /** Group packed run length encoding. Usable for definition/repetition levels
   426     * encoding and Booleans (on one bit: 0 is false; 1 is true.)
   427     */
   428    RLE = 3;
   429  
   430    /** Bit packed encoding.  This can only be used if the data has a known max
   431     * width.  Usable for definition/repetition levels encoding.
   432     */
   433    BIT_PACKED = 4;
   434  
   435    /** Delta encoding for integers. This can be used for int columns and works best
   436     * on sorted data
   437     */
   438    DELTA_BINARY_PACKED = 5;
   439  
   440    /** Encoding for byte arrays to separate the length values and the data. The lengths
   441     * are encoded using DELTA_BINARY_PACKED
   442     */
   443    DELTA_LENGTH_BYTE_ARRAY = 6;
   444  
   445    /** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED.
   446     * Suffixes are stored as delta length byte arrays.
   447     */
   448    DELTA_BYTE_ARRAY = 7;
   449  
   450    /** Dictionary encoding: the ids are encoded using the RLE encoding
   451     */
   452    RLE_DICTIONARY = 8;
   453  }
   454  
   455  /**
   456   * Supported compression algorithms.
   457   *
   458   * Codecs added in 2.4 can be read by readers based on 2.4 and later.
   459   * Codec support may vary between readers based on the format version and
   460   * libraries available at runtime. Gzip, Snappy, and LZ4 codecs are
   461   * widely available, while Zstd and Brotli require additional libraries.
   462   */
   463  enum CompressionCodec {
   464    UNCOMPRESSED = 0;
   465    SNAPPY = 1;
   466    GZIP = 2;
   467    LZO = 3;
   468    BROTLI = 4; // Added in 2.4
   469    LZ4 = 5;    // Added in 2.4
   470    ZSTD = 6;   // Added in 2.4
   471  }
   472  
   473  enum PageType {
   474    DATA_PAGE = 0;
   475    INDEX_PAGE = 1;
   476    DICTIONARY_PAGE = 2;
   477    DATA_PAGE_V2 = 3;
   478  }
   479  
   480  /**
   481   * Enum to annotate whether lists of min/max elements inside ColumnIndex
   482   * are ordered and if so, in which direction.
   483   */
   484  enum BoundaryOrder {
   485    UNORDERED = 0;
   486    ASCENDING = 1;
   487    DESCENDING = 2;
   488  }
   489  
   490  /** Data page header */
   491  struct DataPageHeader {
   492    /** Number of values, including NULLs, in this data page. **/
   493    1: required i32 num_values
   494  
   495    /** Encoding used for this data page **/
   496    2: required Encoding encoding
   497  
   498    /** Encoding used for definition levels **/
   499    3: required Encoding definition_level_encoding;
   500  
   501    /** Encoding used for repetition levels **/
   502    4: required Encoding repetition_level_encoding;
   503  
   504    /** Optional statistics for the data in this page**/
   505    5: optional Statistics statistics;
   506  }
   507  
   508  struct IndexPageHeader {
   509    /** TODO: no fields have been defined for index pages yet **/
   510  }
   511  
   512  struct DictionaryPageHeader {
   513    /** Number of values in the dictionary **/
   514    1: required i32 num_values;
   515  
   516    /** Encoding using this dictionary page **/
   517    2: required Encoding encoding
   518  
   519    /** If true, the entries in the dictionary are sorted in ascending order **/
   520    3: optional bool is_sorted;
   521  }
   522  
   523  /**
   524   * New page format allowing reading levels without decompressing the data
   525   * Repetition and definition levels are uncompressed
   526   * The remaining section containing the data is compressed if is_compressed is true
   527   **/
   528  struct DataPageHeaderV2 {
   529    /** Number of values, including NULLs, in this data page. **/
   530    1: required i32 num_values
   531    /** Number of NULL values, in this data page.
   532        Number of non-null = num_values - num_nulls which is also the number of values in the data section **/
   533    2: required i32 num_nulls
   534    /** Number of rows in this data page, which means pages change on record boundaries (r = 0) **/
   535    3: required i32 num_rows
   536    /** Encoding used for data in this page **/
   537    4: required Encoding encoding
   538  
   539    // repetition levels and definition levels are always using RLE (without size in it)
   540  
   541    /** length of the definition levels */
   542    5: required i32 definition_levels_byte_length;
   543    /** length of the repetition levels */
   544    6: required i32 repetition_levels_byte_length;
   545  
   546    /** Whether the values are compressed.
   547    If true, the section of the page between
   548    definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
   549    is compressed with the compression_codec.
   550    If missing, it is considered compressed (the field defaults to 1, i.e. true). */
   551    7: optional bool is_compressed = 1;
   552  
   553    /** Optional statistics for the data in this page */
   554    8: optional Statistics statistics;
   555  }
   556  
   557  struct PageHeader {
   558    /** the type of the page: indicates which of the *_header fields is set **/
   559    1: required PageType type
   560  
   561    /** Uncompressed page size in bytes (not including this header) **/
   562    2: required i32 uncompressed_page_size
   563  
   564    /** Compressed page size in bytes (not including this header) **/
   565    3: required i32 compressed_page_size
   566  
   567    /** 32bit crc for the data below. This allows for disabling checksumming in HDFS
   568     *  if only a few pages need to be read
   569     **/
   570    4: optional i32 crc
   571  
   572    // Headers for page specific data.  Only one will be set.
   573    5: optional DataPageHeader data_page_header;
   574    6: optional IndexPageHeader index_page_header;
   575    7: optional DictionaryPageHeader dictionary_page_header;
   576    8: optional DataPageHeaderV2 data_page_header_v2;
   577  }
   578  
   579  /**
   580   * Wrapper struct to store key values
   581   */
   582   struct KeyValue {
   583    1: required string key
   584    2: optional string value
   585  }
   586  
   587  /**
   588   * Wrapper struct to specify sort order
   589   */
   590  struct SortingColumn {
   591    /** The column index (in this row group) **/
   592    1: required i32 column_idx
   593  
   594    /** If true, indicates this column is sorted in descending order. **/
   595    2: required bool descending
   596  
   597    /** If true, nulls will come before non-null values, otherwise,
   598     * nulls go at the end. */
   599    3: required bool nulls_first
   600  }
   601  
   602  /**
   603   * statistics of a given page type and encoding
   604   */
   605  struct PageEncodingStats {
   606  
   607    /** the page type (data/dic/...) **/
   608    1: required PageType page_type;
   609  
   610    /** encoding of the page **/
   611    2: required Encoding encoding;
   612  
   613    /** number of pages of this type with this encoding **/
   614    3: required i32 count;
   615  
   616  }
   617  
   618  /**
   619   * Description for column metadata
   620   */
   621  struct ColumnMetaData {
   622    /** Type of this column **/
   623    1: required Type type
   624  
   625    /** Set of all encodings used for this column. The purpose is to validate
   626     * whether we can decode those pages. **/
   627    2: required list<Encoding> encodings
   628  
   629    /** Path in schema **/
   630    3: required list<string> path_in_schema
   631  
   632    /** Compression codec **/
   633    4: required CompressionCodec codec
   634  
   635    /** Number of values in this column **/
   636    5: required i64 num_values
   637  
   638    /** total byte size of all uncompressed pages in this column chunk (including the headers) **/
   639    6: required i64 total_uncompressed_size
   640  
   641    /** total byte size of all compressed pages in this column chunk (including the headers) **/
   642    7: required i64 total_compressed_size
   643  
   644    /** Optional key/value metadata **/
   645    8: optional list<KeyValue> key_value_metadata
   646  
   647    /** Byte offset from beginning of file to first data page **/
   648    9: required i64 data_page_offset
   649  
   650    /** Byte offset from beginning of file to root index page **/
   651    10: optional i64 index_page_offset
   652  
   653    /** Byte offset from the beginning of file to first (only) dictionary page **/
   654    11: optional i64 dictionary_page_offset
   655  
   656    /** optional statistics for this column chunk */
   657    12: optional Statistics statistics;
   658  
   659    /** Set of all encodings used for pages in this column chunk.
   660     * This information can be used to determine if all data pages are
   661     * dictionary encoded for example **/
   662    13: optional list<PageEncodingStats> encoding_stats;
   663  }
   664  
   665  struct ColumnChunk {
   666    /** File where column data is stored.  If not set, assumed to be same file as
   667      * metadata.  This path is relative to the current file.
   668      **/
   669    1: optional string file_path
   670  
   671    /** Byte offset in file_path to the ColumnMetaData **/
   672    2: required i64 file_offset
   673  
   674    /** Column metadata for this chunk. This is the same content as what is at
   675     * file_path/file_offset.  Having it here has it replicated in the file
   676     * metadata.
   677     **/
   678    3: optional ColumnMetaData meta_data
   679  
   680    /** File offset of ColumnChunk's OffsetIndex **/
   681    4: optional i64 offset_index_offset
   682  
   683    /** Size of ColumnChunk's OffsetIndex, in bytes **/
   684    5: optional i32 offset_index_length
   685  
   686    /** File offset of ColumnChunk's ColumnIndex **/
   687    6: optional i64 column_index_offset
   688  
   689    /** Size of ColumnChunk's ColumnIndex, in bytes **/
   690    7: optional i32 column_index_length
   691  }
   692  
   693  struct RowGroup {
   694    /** Metadata for each column chunk in this row group.
   695     * This list must have the same order as the SchemaElement list in FileMetaData.
   696     **/
   697    1: required list<ColumnChunk> columns
   698  
   699    /** Total byte size of all the uncompressed column data in this row group **/
   700    2: required i64 total_byte_size
   701  
   702    /** Number of rows in this row group **/
   703    3: required i64 num_rows
   704  
   705    /** If set, specifies a sort ordering of the rows in this RowGroup.
   706     * The sorting columns can be a subset of all the columns.
   707     */
   708    4: optional list<SortingColumn> sorting_columns
   709  }
   710  
   711  /** Empty struct to signal the order defined by the physical or logical type */
   712  struct TypeDefinedOrder {}
   713  
   714  /**
   715   * Union to specify the order used for the min_value and max_value fields for a
   716   * column. This union takes the role of an enhanced enum that allows rich
   717   * elements (which will be needed for a collation-based ordering in the future).
   718   *
   719   * Possible values are:
   720   * * TypeDefinedOrder - the column uses the order defined by its logical or
   721   *                      physical type (if there is no logical type).
   722   *
   723   * If the reader does not support the value of this union, min and max stats
   724   * for this column should be ignored.
   725   */
   726  union ColumnOrder {
   727  
   728    /**
   729     * The sort orders for logical types are:
   730     *   UTF8 - unsigned byte-wise comparison
   731     *   INT8 - signed comparison
   732     *   INT16 - signed comparison
   733     *   INT32 - signed comparison
   734     *   INT64 - signed comparison
   735     *   UINT8 - unsigned comparison
   736     *   UINT16 - unsigned comparison
   737     *   UINT32 - unsigned comparison
   738     *   UINT64 - unsigned comparison
   739     *   DECIMAL - signed comparison of the represented value
   740     *   DATE - signed comparison
   741     *   TIME_MILLIS - signed comparison
   742     *   TIME_MICROS - signed comparison
   743     *   TIMESTAMP_MILLIS - signed comparison
   744     *   TIMESTAMP_MICROS - signed comparison
   745     *   INTERVAL - unsigned comparison
   746     *   JSON - unsigned byte-wise comparison
   747     *   BSON - unsigned byte-wise comparison
   748     *   ENUM - unsigned byte-wise comparison
   749     *   LIST - undefined
   750     *   MAP - undefined
   751     *
   752     * In the absence of logical types, the sort order is determined by the physical type:
   753     *   BOOLEAN - false, true
   754     *   INT32 - signed comparison
   755     *   INT64 - signed comparison
   756     *   INT96 (only used for legacy timestamps) - undefined
   757     *   FLOAT - signed comparison of the represented value (*)
   758     *   DOUBLE - signed comparison of the represented value (*)
   759     *   BYTE_ARRAY - unsigned byte-wise comparison
   760     *   FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
   761     *
   762     * (*) Because the sorting order is not specified properly for floating
   763     *     point values (relations vs. total ordering) the following
   764     *     compatibility rules should be applied when reading statistics:
   765     *     - If the min is a NaN, it should be ignored.
   766     *     - If the max is a NaN, it should be ignored.
   767     *     - If the min is +0, the row group may contain -0 values as well.
   768     *     - If the max is -0, the row group may contain +0 values as well.
   769     *     - When looking for NaN values, min and max should be ignored.
   770     */
   771    1: TypeDefinedOrder TYPE_ORDER;
   772  }
   773  
   774  struct PageLocation {
   775    /** Offset of the page in the file **/
   776    1: required i64 offset
   777  
   778    /**
   779     * Size of the page, including header. Sum of compressed_page_size and header
   780     * length
   781     */
   782    2: required i32 compressed_page_size
   783  
   784    /**
   785     * Index within the RowGroup of the first row of the page; this means pages
   786     * change on record boundaries (r = 0).
   787     */
   788    3: required i64 first_row_index
   789  }
   790  
   791  struct OffsetIndex {
   792    /**
   793     * PageLocations, ordered by increasing PageLocation.offset. It is required
   794     * that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
   795     */
   796    1: required list<PageLocation> page_locations
   797  }
   798  
   799  /**
   800   * Description for ColumnIndex.
   801   * Each <array-field>[i] refers to the page at OffsetIndex.page_locations[i]
   802   */
   803  struct ColumnIndex {
   804    /**
   805     * A list of Boolean values to determine the validity of the corresponding
   806     * min and max values. If true, a page contains only null values, and writers
   807     * have to set the corresponding entries in min_values and max_values to
   808     * byte[0], so that all lists have the same length. If false, the
   809     * corresponding entries in min_values and max_values must be valid.
   810     */
   811    1: required list<bool> null_pages
   812  
   813    /**
   814     * Two lists containing lower and upper bounds for the values of each page.
   815     * These may be the actual minimum and maximum values found on a page, but
   816     * can also be (more compact) values that do not exist on a page. For
   817     * example, instead of storing "Blart Versenwald III", a writer may set
   818     * min_values[i]="B", max_values[i]="C". Such more compact values must still
   819     * be valid values within the column's logical type. Readers must make sure
   820     * that list entries are populated before using them by inspecting null_pages.
   821     */
   822    2: required list<binary> min_values
   823    3: required list<binary> max_values
   824  
   825    /**
   826     * Stores whether both min_values and max_values are ordered and if so, in
   827     * which direction. This allows readers to perform binary searches in both
   828     * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
   829     * if the lists are ordered.
   830     */
   831    4: required BoundaryOrder boundary_order
   832  
   833    /** A list containing the number of null values for each page **/
   834    5: optional list<i64> null_counts
   835  }
   836  
   837  /**
   838   * Description for file metadata
   839   */
   840  struct FileMetaData {
   841    /** Version of this file **/
   842    1: required i32 version
   843  
   844    /** Parquet schema for this file.  This schema contains metadata for all the columns.
   845     * The schema is represented as a tree with a single root.  The nodes of the tree
   846     * are flattened to a list by doing a depth-first traversal.
   847     * The column metadata contains the path in the schema for that column which can be
   848     * used to map columns to nodes in the schema.
   849     * The first element is the root **/
   850    2: required list<SchemaElement> schema;
   851  
   852    /** Number of rows in this file **/
   853    3: required i64 num_rows
   854  
   855    /** Row groups in this file **/
   856    4: required list<RowGroup> row_groups
   857  
   858    /** Optional key/value metadata **/
   859    5: optional list<KeyValue> key_value_metadata
   860  
   861    /** String for application that wrote this file.  This should be in the format
   862     * <Application> version <App Version> (build <App Build Hash>).
   863     * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
   864     **/
   865    6: optional string created_by
   866  
   867    /**
   868     * Sort order used for the min_value and max_value fields of each column in
   869     * this file. Each sort order corresponds to one column, determined by its
   870     * position in the list, matching the position of the column in the schema.
   871     *
   872     * Without column_orders, the meaning of the min_value and max_value fields is
   873     * undefined. To ensure well-defined behavior, if min_value and max_value are
   874     * written to a Parquet file, column_orders must be written as well.
   875     *
   876     * The obsolete min and max fields are always sorted by signed comparison
   877     * regardless of column_orders.
   878     */
   879    7: optional list<ColumnOrder> column_orders;
   880  }
   881