/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/**
 * File format description for the parquet file format
 */
namespace cpp parquet
namespace java org.apache.parquet.format

/**
 * Types supported by Parquet. These types are intended to be used in combination
 * with the encodings to control the on disk storage format.
 * For example INT16 is not included as a type since a good encoding of INT32
 * would handle this.
 */
enum Type {
  BOOLEAN = 0;
  INT32 = 1;
  INT64 = 2;
  INT96 = 3;  // deprecated, only used by legacy implementations.
  FLOAT = 4;
  DOUBLE = 5;
  BYTE_ARRAY = 6;
  FIXED_LEN_BYTE_ARRAY = 7;
}

/**
 * Common types used by frameworks (e.g. hive, pig) using parquet. This helps map
 * between types in those frameworks to the base types in parquet. This is only
 * metadata and not needed to read or write the data.
 */
enum ConvertedType {
  /** a BYTE_ARRAY actually contains UTF8 encoded chars */
  UTF8 = 0;

  /** a map is converted as an optional field containing a repeated key/value pair */
  MAP = 1;

  /** a key/value pair is converted into a group of two fields */
  MAP_KEY_VALUE = 2;

  /** a list is converted into an optional field containing a repeated field for its
   * values */
  LIST = 3;

  /** an enum is converted into a binary field */
  ENUM = 4;

  /**
   * A decimal value.
   *
   * This may be used to annotate binary or fixed primitive types. The
   * underlying byte array stores the unscaled value encoded as two's
   * complement using big-endian byte order (the most significant byte is the
   * zeroth element). The value of the decimal is the value * 10^{-scale}.
   *
   * This must be accompanied by a (maximum) precision and a scale in the
   * SchemaElement. The precision specifies the number of digits in the decimal
   * and the scale stores the location of the decimal point. For example 1.23
   * would have precision 3 (3 total digits) and scale 2 (the decimal point is
   * 2 digits over).
   */
  DECIMAL = 5;

  /**
   * A Date
   *
   * Stored as days since Unix epoch, encoded as the INT32 physical type.
   *
   */
  DATE = 6;

  /**
   * A time
   *
   * The total number of milliseconds since midnight. The value is stored
   * as an INT32 physical type.
   */
  TIME_MILLIS = 7;

  /**
   * A time.
   *
   * The total number of microseconds since midnight. The value is stored as
   * an INT64 physical type.
   */
  TIME_MICROS = 8;

  /**
   * A date/time combination
   *
   * Date and time recorded as milliseconds since the Unix epoch. Recorded as
   * a physical type of INT64.
   */
  TIMESTAMP_MILLIS = 9;

  /**
   * A date/time combination
   *
   * Date and time recorded as microseconds since the Unix epoch. The value is
   * stored as an INT64 physical type.
   */
  TIMESTAMP_MICROS = 10;


  /**
   * An unsigned integer value.
   *
   * The number describes the maximum number of meaningful data bits in
   * the stored value. 8, 16 and 32 bit values are stored using the
   * INT32 physical type. 64 bit values are stored using the INT64
   * physical type.
   *
   */
  UINT_8 = 11;
  UINT_16 = 12;
  UINT_32 = 13;
  UINT_64 = 14;

  /**
   * A signed integer value.
   *
   * The number describes the maximum number of meaningful data bits in
   * the stored value. 8, 16 and 32 bit values are stored using the
   * INT32 physical type. 64 bit values are stored using the INT64
   * physical type.
   *
   */
  INT_8 = 15;
  INT_16 = 16;
  INT_32 = 17;
  INT_64 = 18;

  /**
   * An embedded JSON document
   *
   * A JSON document embedded within a single UTF8 column.
   */
  JSON = 19;

  /**
   * An embedded BSON document
   *
   * A BSON document embedded within a single BINARY column.
   */
  BSON = 20;

  /**
   * An interval of time
   *
   * This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12
   * This data is composed of three separate little endian unsigned
   * integers. Each stores a component of a duration of time. The first
   * integer identifies the number of months associated with the duration,
   * the second identifies the number of days associated with the duration
   * and the third identifies the number of milliseconds associated with
   * the provided duration. This duration of time is independent of any
   * particular timezone or date.
   */
  INTERVAL = 21;
}

/**
 * Representation of Schemas
 */
enum FieldRepetitionType {
  /** This field is required (can not be null) and each record has exactly 1 value. */
  REQUIRED = 0;

  /** The field is optional (can be null) and each record has 0 or 1 values. */
  OPTIONAL = 1;

  /** The field is repeated and can contain 0 or more values */
  REPEATED = 2;
}

/**
 * Statistics per row group and per page
 * All fields are optional.
 */
struct Statistics {
  /**
   * DEPRECATED: min and max value of the column. Use min_value and max_value.
   *
   * Values are encoded using PLAIN encoding, except that variable-length byte
   * arrays do not include a length prefix.
   *
   * These fields encode min and max values determined by signed comparison
   * only. New files should use the correct order for a column's logical type
   * and store the values in the min_value and max_value fields.
   *
   * To support older readers, these may be set when the column order is
   * signed.
   */
  1: optional binary max;
  2: optional binary min;
  /** count of null values in the column */
  3: optional i64 null_count;
  /** count of distinct values occurring */
  4: optional i64 distinct_count;
  /**
   * Min and max values for the column, determined by its ColumnOrder.
   *
   * Values are encoded using PLAIN encoding, except that variable-length byte
   * arrays do not include a length prefix.
   */
  5: optional binary max_value;
  6: optional binary min_value;
}

/** Empty structs to use as logical type annotations */
struct StringType {}  // allowed for BINARY, must be encoded with UTF-8
struct UUIDType {}    // allowed for FIXED[16], must encode raw UUID bytes
struct MapType {}     // see LogicalTypes.md
struct ListType {}    // see LogicalTypes.md
struct EnumType {}    // allowed for BINARY, must be encoded with UTF-8
struct DateType {}    // allowed for INT32

/**
 * Logical type to annotate a column that is always null.
 *
 * Sometimes when discovering the schema of existing data, values are always
 * null and the physical type can't be determined. This annotation signals
 * the case where the physical type was guessed from all null values.
 */
struct NullType {}    // allowed for any physical type, only null values stored

/**
 * Decimal logical type annotation
 *
 * To maintain forward-compatibility in v1, implementations using this logical
 * type must also set scale and precision on the annotated SchemaElement.
 *
 * Allowed for physical types: INT32, INT64, FIXED, and BINARY
 */
struct DecimalType {
  1: required i32 scale
  2: required i32 precision
}

/** Time units for logical types */
struct MilliSeconds {}
struct MicroSeconds {}
struct NanoSeconds {}
union TimeUnit {
  1: MilliSeconds MILLIS
  2: MicroSeconds MICROS
  3: NanoSeconds NANOS
}

/**
 * Timestamp logical type annotation
 *
 * Allowed for physical types: INT64
 */
struct TimestampType {
  1: required bool isAdjustedToUTC
  2: required TimeUnit unit
}

/**
 * Time logical type annotation
 *
 * Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
 */
struct TimeType {
  1: required bool isAdjustedToUTC
  2: required TimeUnit unit
}

/**
 * Integer logical type annotation
 *
 * bitWidth must be 8, 16, 32, or 64.
 *
 * Allowed for physical types: INT32, INT64
 */
struct IntType {
  1: required byte bitWidth
  2: required bool isSigned
}

/**
 * Embedded JSON logical type annotation
 *
 * Allowed for physical types: BINARY
 */
struct JsonType {
}

/**
 * Embedded BSON logical type annotation
 *
 * Allowed for physical types: BINARY
 */
struct BsonType {
}

/**
 * LogicalType annotations to replace ConvertedType.
 *
 * To maintain compatibility, implementations using LogicalType for a
 * SchemaElement must also set the corresponding ConvertedType from the
 * following table.
 */
union LogicalType {
  1:  StringType STRING       // use ConvertedType UTF8
  2:  MapType MAP             // use ConvertedType MAP
  3:  ListType LIST           // use ConvertedType LIST
  4:  EnumType ENUM           // use ConvertedType ENUM
  5:  DecimalType DECIMAL     // use ConvertedType DECIMAL
  6:  DateType DATE           // use ConvertedType DATE
  7:  TimeType TIME           // use ConvertedType TIME_MICROS or TIME_MILLIS
  8:  TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS
  // 9: reserved for INTERVAL
  10: IntType INTEGER         // use ConvertedType INT_* or UINT_*
  11: NullType UNKNOWN        // no compatible ConvertedType
  12: JsonType JSON           // use ConvertedType JSON
  13: BsonType BSON           // use ConvertedType BSON
  14: UUIDType UUID           // no compatible ConvertedType
}

/**
 * Represents an element inside a schema definition.
 *  - if it is a group (inner node) then type is undefined and num_children is defined
 *  - if it is a primitive type (leaf) then type is defined and num_children is undefined
 * the nodes are listed in depth first traversal order.
 */
struct SchemaElement {
  /** Data type for this field. Not set if the current element is a non-leaf node */
  1: optional Type type;

  /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
   * Otherwise, if specified, this is the maximum bit length to store any of the values.
   * (e.g. a low cardinality INT col could have this set to 3). Note that this is
   * in the schema, and therefore fixed for the entire file.
   */
  2: optional i32 type_length;

  /** repetition of the field. The root of the schema does not have a repetition_type.
   * All other nodes must have one */
  3: optional FieldRepetitionType repetition_type;

  /** Name of the field in the schema */
  4: required string name;

  /** Nested fields. Since thrift does not support nested fields,
   * the nesting is flattened to a single list by a depth-first traversal.
   * The children count is used to construct the nested relationship.
   * This field is not set when the element is a primitive type
   */
  5: optional i32 num_children;

  /** When the schema is the result of a conversion from another model
   * Used to record the original type to help with cross conversion.
   */
  6: optional ConvertedType converted_type;

  /** Used when this column contains decimal data.
   * See the DECIMAL converted type for more details.
   */
  7: optional i32 scale
  8: optional i32 precision

  /** When the original schema supports field ids, this will save the
   * original field id in the parquet schema
   */
  9: optional i32 field_id;

  /**
   * The logical type of this SchemaElement
   *
   * LogicalType replaces ConvertedType, but ConvertedType is still required
   * for some logical types to ensure forward-compatibility in format v1.
   */
  10: optional LogicalType logicalType
}

/**
 * Encodings supported by Parquet. Not all encodings are valid for all types. These
 * enums are also used to specify the encoding of definition and repetition levels.
 * See the accompanying doc for the details of the more complicated encodings.
 */
enum Encoding {
  /** Default encoding.
   * BOOLEAN - 1 bit per value. 0 is false; 1 is true.
   * INT32 - 4 bytes per value. Stored as little-endian.
   * INT64 - 8 bytes per value. Stored as little-endian.
   * FLOAT - 4 bytes per value. IEEE. Stored as little-endian.
   * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian.
   * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
   * FIXED_LEN_BYTE_ARRAY - Just the bytes.
   */
  PLAIN = 0;

  /** Group VarInt encoding for INT32/INT64.
   * This encoding is deprecated. It was never used
   */
  //  GROUP_VAR_INT = 1;

  /**
   * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
   * plain type.
   * in a data page use RLE_DICTIONARY instead.
   * in a Dictionary page use PLAIN instead
   */
  PLAIN_DICTIONARY = 2;

  /** Group packed run length encoding. Usable for definition/repetition levels
   * encoding and Booleans (on one bit: 0 is false; 1 is true.)
   */
  RLE = 3;

  /** Bit packed encoding. This can only be used if the data has a known max
   * width. Usable for definition/repetition levels encoding.
   */
  BIT_PACKED = 4;

  /** Delta encoding for integers. This can be used for int columns and works best
   * on sorted data
   */
  DELTA_BINARY_PACKED = 5;

  /** Encoding for byte arrays to separate the length values and the data. The lengths
   * are encoded using DELTA_BINARY_PACKED
   */
  DELTA_LENGTH_BYTE_ARRAY = 6;

  /** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED.
   * Suffixes are stored as delta length byte arrays.
   */
  DELTA_BYTE_ARRAY = 7;

  /** Dictionary encoding: the ids are encoded using the RLE encoding
   */
  RLE_DICTIONARY = 8;
}

/**
 * Supported compression algorithms.
 *
 * Codecs added in 2.4 can be read by readers based on 2.4 and later.
 * Codec support may vary between readers based on the format version and
 * libraries available at runtime. Gzip, Snappy, and LZ4 codecs are
 * widely available, while Zstd and Brotli require additional libraries.
 */
enum CompressionCodec {
  UNCOMPRESSED = 0;
  SNAPPY = 1;
  GZIP = 2;
  LZO = 3;
  BROTLI = 4; // Added in 2.4
  LZ4 = 5;    // Added in 2.4
  ZSTD = 6;   // Added in 2.4
}

enum PageType {
  DATA_PAGE = 0;
  INDEX_PAGE = 1;
  DICTIONARY_PAGE = 2;
  DATA_PAGE_V2 = 3;
}

/**
 * Enum to annotate whether lists of min/max elements inside ColumnIndex
 * are ordered and if so, in which direction.
 */
enum BoundaryOrder {
  UNORDERED = 0;
  ASCENDING = 1;
  DESCENDING = 2;
}

/** Data page header */
struct DataPageHeader {
  /** Number of values, including NULLs, in this data page. **/
  1: required i32 num_values

  /** Encoding used for this data page **/
  2: required Encoding encoding

  /** Encoding used for definition levels **/
  3: required Encoding definition_level_encoding;

  /** Encoding used for repetition levels **/
  4: required Encoding repetition_level_encoding;

  /** Optional statistics for the data in this page **/
  5: optional Statistics statistics;
}

struct IndexPageHeader {
  /** TODO: **/
}

struct DictionaryPageHeader {
  /** Number of values in the dictionary **/
  1: required i32 num_values;

  /** Encoding using this dictionary page **/
  2: required Encoding encoding

  /** If true, the entries in the dictionary are sorted in ascending order **/
  3: optional bool is_sorted;
}

/**
 * New page format allowing reading levels without decompressing the data
 * Repetition and definition levels are uncompressed
 * The remaining section containing the data is compressed if is_compressed is true
 **/
struct DataPageHeaderV2 {
  /** Number of values, including NULLs, in this data page. **/
  1: required i32 num_values
  /** Number of NULL values, in this data page.
      Number of non-null = num_values - num_nulls which is also the number of values in the data section **/
  2: required i32 num_nulls
  /** Number of rows in this data page, which means pages change on record boundaries (r = 0) **/
  3: required i32 num_rows
  /** Encoding used for data in this page **/
  4: required Encoding encoding

  // repetition levels and definition levels are always using RLE (without size in it)

  /** length of the definition levels */
  5: required i32 definition_levels_byte_length;
  /** length of the repetition levels */
  6: required i32 repetition_levels_byte_length;

  /** whether the values are compressed.
  Which means the section of the page between
  definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
  is compressed with the compression_codec.
  If missing it is considered compressed */
  7: optional bool is_compressed = 1;

  /** optional statistics for the data in this page */
  8: optional Statistics statistics;
}

struct PageHeader {
  /** the type of the page: indicates which of the *_header fields is set **/
  1: required PageType type

  /** Uncompressed page size in bytes (not including this header) **/
  2: required i32 uncompressed_page_size

  /** Compressed page size in bytes (not including this header) **/
  3: required i32 compressed_page_size

  /** 32bit crc for the data below. This allows for disabling checksumming in HDFS
   * if only a few pages need to be read
   **/
  4: optional i32 crc

  // Headers for page specific data. One only will be set.
  5: optional DataPageHeader data_page_header;
  6: optional IndexPageHeader index_page_header;
  7: optional DictionaryPageHeader dictionary_page_header;
  8: optional DataPageHeaderV2 data_page_header_v2;
}

/**
 * Wrapper struct to store key values
 */
struct KeyValue {
  1: required string key
  2: optional string value
}

/**
 * Wrapper struct to specify sort order
 */
struct SortingColumn {
  /** The column index (in this row group) **/
  1: required i32 column_idx

  /** If true, indicates this column is sorted in descending order. **/
  2: required bool descending

  /** If true, nulls will come before non-null values, otherwise,
   * nulls go at the end. */
  3: required bool nulls_first
}

/**
 * statistics of a given page type and encoding
 */
struct PageEncodingStats {

  /** the page type (data/dic/...) **/
  1: required PageType page_type;

  /** encoding of the page **/
  2: required Encoding encoding;

  /** number of pages of this type with this encoding **/
  3: required i32 count;

}

/**
 * Description for column metadata
 */
struct ColumnMetaData {
  /** Type of this column **/
  1: required Type type

  /** Set of all encodings used for this column. The purpose is to validate
   * whether we can decode those pages. **/
  2: required list<Encoding> encodings

  /** Path in schema **/
  3: required list<string> path_in_schema

  /** Compression codec **/
  4: required CompressionCodec codec

  /** Number of values in this column **/
  5: required i64 num_values

  /** total byte size of all uncompressed pages in this column chunk (including the headers) **/
  6: required i64 total_uncompressed_size

  /** total byte size of all compressed pages in this column chunk (including the headers) **/
  7: required i64 total_compressed_size

  /** Optional key/value metadata **/
  8: optional list<KeyValue> key_value_metadata

  /** Byte offset from beginning of file to first data page **/
  9: required i64 data_page_offset

  /** Byte offset from beginning of file to root index page **/
  10: optional i64 index_page_offset

  /** Byte offset from the beginning of file to first (only) dictionary page **/
  11: optional i64 dictionary_page_offset

  /** optional statistics for this column chunk */
  12: optional Statistics statistics;

  /** Set of all encodings used for pages in this column chunk.
   * This information can be used to determine if all data pages are
   * dictionary encoded for example **/
  13: optional list<PageEncodingStats> encoding_stats;
}

struct ColumnChunk {
  /** File where column data is stored. If not set, assumed to be same file as
   * metadata. This path is relative to the current file.
   **/
  1: optional string file_path

  /** Byte offset in file_path to the ColumnMetaData **/
  2: required i64 file_offset

  /** Column metadata for this chunk. This is the same content as what is at
   * file_path/file_offset. Having it here has it replicated in the file
   * metadata.
   **/
  3: optional ColumnMetaData meta_data

  /** File offset of ColumnChunk's OffsetIndex **/
  4: optional i64 offset_index_offset

  /** Size of ColumnChunk's OffsetIndex, in bytes **/
  5: optional i32 offset_index_length

  /** File offset of ColumnChunk's ColumnIndex **/
  6: optional i64 column_index_offset

  /** Size of ColumnChunk's ColumnIndex, in bytes **/
  7: optional i32 column_index_length
}

struct RowGroup {
  /** Metadata for each column chunk in this row group.
   * This list must have the same order as the SchemaElement list in FileMetaData.
   **/
  1: required list<ColumnChunk> columns

  /** Total byte size of all the uncompressed column data in this row group **/
  2: required i64 total_byte_size

  /** Number of rows in this row group **/
  3: required i64 num_rows

  /** If set, specifies a sort ordering of the rows in this RowGroup.
   * The sorting columns can be a subset of all the columns.
   */
  4: optional list<SortingColumn> sorting_columns
}

/** Empty struct to signal the order defined by the physical or logical type */
struct TypeDefinedOrder {}

/**
 * Union to specify the order used for the min_value and max_value fields for a
 * column. This union takes the role of an enhanced enum that allows rich
 * elements (which will be needed for a collation-based ordering in the future).
 *
 * Possible values are:
 * * TypeDefinedOrder - the column uses the order defined by its logical or
 *                      physical type (if there is no logical type).
 *
 * If the reader does not support the value of this union, min and max stats
 * for this column should be ignored.
 */
union ColumnOrder {

  /**
   * The sort orders for logical types are:
   *   UTF8 - unsigned byte-wise comparison
   *   INT8 - signed comparison
   *   INT16 - signed comparison
   *   INT32 - signed comparison
   *   INT64 - signed comparison
   *   UINT8 - unsigned comparison
   *   UINT16 - unsigned comparison
   *   UINT32 - unsigned comparison
   *   UINT64 - unsigned comparison
   *   DECIMAL - signed comparison of the represented value
   *   DATE - signed comparison
   *   TIME_MILLIS - signed comparison
   *   TIME_MICROS - signed comparison
   *   TIMESTAMP_MILLIS - signed comparison
   *   TIMESTAMP_MICROS - signed comparison
   *   INTERVAL - unsigned comparison
   *   JSON - unsigned byte-wise comparison
   *   BSON - unsigned byte-wise comparison
   *   ENUM - unsigned byte-wise comparison
   *   LIST - undefined
   *   MAP - undefined
   *
   * In the absence of logical types, the sort order is determined by the physical type:
   *   BOOLEAN - false, true
   *   INT32 - signed comparison
   *   INT64 - signed comparison
   *   INT96 (only used for legacy timestamps) - undefined
   *   FLOAT - signed comparison of the represented value (*)
   *   DOUBLE - signed comparison of the represented value (*)
   *   BYTE_ARRAY - unsigned byte-wise comparison
   *   FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
   *
   * (*) Because the sorting order is not specified properly for floating
   *     point values (relations vs. total ordering) the following
   *     compatibility rules should be applied when reading statistics:
   *     - If the min is a NaN, it should be ignored.
   *     - If the max is a NaN, it should be ignored.
   *     - If the min is +0, the row group may contain -0 values as well.
   *     - If the max is -0, the row group may contain +0 values as well.
   *     - When looking for NaN values, min and max should be ignored.
   */
  1: TypeDefinedOrder TYPE_ORDER;
}

struct PageLocation {
  /** Offset of the page in the file **/
  1: required i64 offset

  /**
   * Size of the page, including header. Sum of compressed_page_size and header
   * length
   */
  2: required i32 compressed_page_size

  /**
   * Index within the RowGroup of the first row of the page; this means pages
   * change on record boundaries (r = 0).
   */
  3: required i64 first_row_index
}

struct OffsetIndex {
  /**
   * PageLocations, ordered by increasing PageLocation.offset. It is required
   * that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
   */
  1: required list<PageLocation> page_locations
}

/**
 * Description for ColumnIndex.
 * Each <array-field>[i] refers to the page at OffsetIndex.page_locations[i]
 */
struct ColumnIndex {
  /**
   * A list of Boolean values to determine the validity of the corresponding
   * min and max values. If true, a page contains only null values, and writers
   * have to set the corresponding entries in min_values and max_values to
   * byte[0], so that all lists have the same length. If false, the
   * corresponding entries in min_values and max_values must be valid.
   */
  1: required list<bool> null_pages

  /**
   * Two lists containing lower and upper bounds for the values of each page.
   * These may be the actual minimum and maximum values found on a page, but
   * can also be (more compact) values that do not exist on a page. For
   * example, instead of storing "Blart Versenwald III", a writer may set
   * min_values[i]="B", max_values[i]="C". Such more compact values must still
   * be valid values within the column's logical type. Readers must make sure
   * that list entries are populated before using them by inspecting null_pages.
   */
  2: required list<binary> min_values
  3: required list<binary> max_values

  /**
   * Stores whether both min_values and max_values are ordered and if so, in
   * which direction. This allows readers to perform binary searches in both
   * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
   * if the lists are ordered.
   */
  4: required BoundaryOrder boundary_order

  /** A list containing the number of null values for each page **/
  5: optional list<i64> null_counts
}

/**
 * Description for file metadata
 */
struct FileMetaData {
  /** Version of this file **/
  1: required i32 version

  /** Parquet schema for this file. This schema contains metadata for all the columns.
   * The schema is represented as a tree with a single root. The nodes of the tree
   * are flattened to a list by doing a depth-first traversal.
   * The column metadata contains the path in the schema for that column which can be
   * used to map columns to nodes in the schema.
   * The first element is the root **/
  2: required list<SchemaElement> schema;

  /** Number of rows in this file **/
  3: required i64 num_rows

  /** Row groups in this file **/
  4: required list<RowGroup> row_groups

  /** Optional key/value metadata **/
  5: optional list<KeyValue> key_value_metadata

  /** String for application that wrote this file. This should be in the format
   * <Application> version <App Version> (build <App Build Hash>).
   * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
   **/
  6: optional string created_by

  /**
   * Sort order used for the min_value and max_value fields of each column in
   * this file. Each sort order corresponds to one column, determined by its
   * position in the list, matching the position of the column in the schema.
   *
   * Without column_orders, the meaning of the min_value and max_value fields is
   * undefined. To ensure well-defined behavior, if min_value and max_value are
   * written to a Parquet file, column_orders must be written as well.
   *
   * The obsolete min and max fields are always sorted by signed comparison
   * regardless of column_orders.
   */
  7: optional list<ColumnOrder> column_orders;
}