github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/format/parquet.go (about) 1 package format 2 3 import ( 4 "fmt" 5 6 "github.com/parquet-go/parquet-go/deprecated" 7 ) 8 9 // Types supported by Parquet. These types are intended to be used in combination 10 // with the encodings to control the on disk storage format. For example INT16 11 // is not included as a type since a good encoding of INT32 would handle this. 12 type Type int32 13 14 const ( 15 Boolean Type = 0 16 Int32 Type = 1 17 Int64 Type = 2 18 Int96 Type = 3 // deprecated, only used by legacy implementations. 19 Float Type = 4 20 Double Type = 5 21 ByteArray Type = 6 22 FixedLenByteArray Type = 7 23 ) 24 25 func (t Type) String() string { 26 switch t { 27 case Boolean: 28 return "BOOLEAN" 29 case Int32: 30 return "INT32" 31 case Int64: 32 return "INT64" 33 case Int96: 34 return "INT96" 35 case Float: 36 return "FLOAT" 37 case Double: 38 return "DOUBLE" 39 case ByteArray: 40 return "BYTE_ARRAY" 41 case FixedLenByteArray: 42 return "FIXED_LEN_BYTE_ARRAY" 43 default: 44 return "Type(?)" 45 } 46 } 47 48 // Representation of Schemas. 49 type FieldRepetitionType int32 50 51 const ( 52 // The field is required (can not be null) and each record has exactly 1 value. 53 Required FieldRepetitionType = 0 54 // The field is optional (can be null) and each record has 0 or 1 values. 55 Optional FieldRepetitionType = 1 56 // The field is repeated and can contain 0 or more values. 57 Repeated FieldRepetitionType = 2 58 ) 59 60 func (t FieldRepetitionType) String() string { 61 switch t { 62 case Required: 63 return "REQUIRED" 64 case Optional: 65 return "OPTIONAL" 66 case Repeated: 67 return "REPEATED" 68 default: 69 return "FieldRepeationaType(?)" 70 } 71 } 72 73 // Statistics per row group and per page. 74 // All fields are optional. 75 type Statistics struct { 76 // DEPRECATED: min and max value of the column. Use min_value and max_value. 
77 // 78 // Values are encoded using PLAIN encoding, except that variable-length byte 79 // arrays do not include a length prefix. 80 // 81 // These fields encode min and max values determined by signed comparison 82 // only. New files should use the correct order for a column's logical type 83 // and store the values in the min_value and max_value fields. 84 // 85 // To support older readers, these may be set when the column order is 86 // signed. 87 Max []byte `thrift:"1"` 88 Min []byte `thrift:"2"` 89 // Count of null value in the column. 90 NullCount int64 `thrift:"3"` 91 // Count of distinct values occurring. 92 DistinctCount int64 `thrift:"4"` 93 // Min and max values for the column, determined by its ColumnOrder. 94 // 95 // Values are encoded using PLAIN encoding, except that variable-length byte 96 // arrays do not include a length prefix. 97 MaxValue []byte `thrift:"5"` 98 MinValue []byte `thrift:"6"` 99 } 100 101 // Empty structs to use as logical type annotations. 102 type StringType struct{} // allowed for BINARY, must be encoded with UTF-8 103 type UUIDType struct{} // allowed for FIXED[16], must encode raw UUID bytes 104 type MapType struct{} // see see LogicalTypes.md 105 type ListType struct{} // see LogicalTypes.md 106 type EnumType struct{} // allowed for BINARY, must be encoded with UTF-8 107 type DateType struct{} // allowed for INT32 108 109 func (*StringType) String() string { return "STRING" } 110 func (*UUIDType) String() string { return "UUID" } 111 func (*MapType) String() string { return "MAP" } 112 func (*ListType) String() string { return "LIST" } 113 func (*EnumType) String() string { return "ENUM" } 114 func (*DateType) String() string { return "DATE" } 115 116 // Logical type to annotate a column that is always null. 117 // 118 // Sometimes when discovering the schema of existing data, values are always 119 // null and the physical type can't be determined. 
This annotation signals 120 // the case where the physical type was guessed from all null values. 121 type NullType struct{} 122 123 func (*NullType) String() string { return "NULL" } 124 125 // Decimal logical type annotation 126 // 127 // To maintain forward-compatibility in v1, implementations using this logical 128 // type must also set scale and precision on the annotated SchemaElement. 129 // 130 // Allowed for physical types: INT32, INT64, FIXED, and BINARY 131 type DecimalType struct { 132 Scale int32 `thrift:"1,required"` 133 Precision int32 `thrift:"2,required"` 134 } 135 136 func (t *DecimalType) String() string { 137 // Matching parquet-cli's decimal string format: https://github.com/apache/parquet-mr/blob/d057b39d93014fe40f5067ee4a33621e65c91552/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java#L249-L265 138 return fmt.Sprintf("DECIMAL(%d,%d)", t.Precision, t.Scale) 139 } 140 141 // Time units for logical types. 142 type MilliSeconds struct{} 143 type MicroSeconds struct{} 144 type NanoSeconds struct{} 145 146 func (*MilliSeconds) String() string { return "MILLIS" } 147 func (*MicroSeconds) String() string { return "MICROS" } 148 func (*NanoSeconds) String() string { return "NANOS" } 149 150 type TimeUnit struct { // union 151 Millis *MilliSeconds `thrift:"1"` 152 Micros *MicroSeconds `thrift:"2"` 153 Nanos *NanoSeconds `thrift:"3"` 154 } 155 156 func (u *TimeUnit) String() string { 157 switch { 158 case u.Millis != nil: 159 return u.Millis.String() 160 case u.Micros != nil: 161 return u.Micros.String() 162 case u.Nanos != nil: 163 return u.Nanos.String() 164 default: 165 return "" 166 } 167 } 168 169 // Timestamp logical type annotation 170 // 171 // Allowed for physical types: INT64 172 type TimestampType struct { 173 IsAdjustedToUTC bool `thrift:"1,required"` 174 Unit TimeUnit `thrift:"2,required"` 175 } 176 177 func (t *TimestampType) String() string { 178 return fmt.Sprintf("TIMESTAMP(isAdjustedToUTC=%t,unit=%s)", 
t.IsAdjustedToUTC, &t.Unit) 179 } 180 181 // Time logical type annotation 182 // 183 // Allowed for physical types: INT32 (millis), INT64 (micros, nanos) 184 type TimeType struct { 185 IsAdjustedToUTC bool `thrift:"1,required"` 186 Unit TimeUnit `thrift:"2,required"` 187 } 188 189 func (t *TimeType) String() string { 190 return fmt.Sprintf("TIME(isAdjustedToUTC=%t,unit=%s)", t.IsAdjustedToUTC, &t.Unit) 191 } 192 193 // Integer logical type annotation 194 // 195 // bitWidth must be 8, 16, 32, or 64. 196 // 197 // Allowed for physical types: INT32, INT64 198 type IntType struct { 199 BitWidth int8 `thrift:"1,required"` 200 IsSigned bool `thrift:"2,required"` 201 } 202 203 func (t *IntType) String() string { 204 return fmt.Sprintf("INT(%d,%t)", t.BitWidth, t.IsSigned) 205 } 206 207 // Embedded JSON logical type annotation 208 // 209 // Allowed for physical types: BINARY 210 type JsonType struct{} 211 212 func (t *JsonType) String() string { return "JSON" } 213 214 // Embedded BSON logical type annotation 215 // 216 // Allowed for physical types: BINARY 217 type BsonType struct{} 218 219 func (t *BsonType) String() string { return "BSON" } 220 221 // LogicalType annotations to replace ConvertedType. 222 // 223 // To maintain compatibility, implementations using LogicalType for a 224 // SchemaElement must also set the corresponding ConvertedType (if any) 225 // from the following table. 
226 type LogicalType struct { // union 227 UTF8 *StringType `thrift:"1"` // use ConvertedType UTF8 228 Map *MapType `thrift:"2"` // use ConvertedType Map 229 List *ListType `thrift:"3"` // use ConvertedType List 230 Enum *EnumType `thrift:"4"` // use ConvertedType Enum 231 Decimal *DecimalType `thrift:"5"` // use ConvertedType Decimal + SchemaElement.{Scale, Precision} 232 Date *DateType `thrift:"6"` // use ConvertedType Date 233 234 // use ConvertedType TimeMicros for Time{IsAdjustedToUTC: *, Unit: Micros} 235 // use ConvertedType TimeMillis for Time{IsAdjustedToUTC: *, Unit: Millis} 236 Time *TimeType `thrift:"7"` 237 238 // use ConvertedType TimestampMicros for Timestamp{IsAdjustedToUTC: *, Unit: Micros} 239 // use ConvertedType TimestampMillis for Timestamp{IsAdjustedToUTC: *, Unit: Millis} 240 Timestamp *TimestampType `thrift:"8"` 241 242 // 9: reserved for Interval 243 Integer *IntType `thrift:"10"` // use ConvertedType Int* or Uint* 244 Unknown *NullType `thrift:"11"` // no compatible ConvertedType 245 Json *JsonType `thrift:"12"` // use ConvertedType JSON 246 Bson *BsonType `thrift:"13"` // use ConvertedType BSON 247 UUID *UUIDType `thrift:"14"` // no compatible ConvertedType 248 } 249 250 func (t *LogicalType) String() string { 251 switch { 252 case t.UTF8 != nil: 253 return t.UTF8.String() 254 case t.Map != nil: 255 return t.Map.String() 256 case t.List != nil: 257 return t.List.String() 258 case t.Enum != nil: 259 return t.Enum.String() 260 case t.Decimal != nil: 261 return t.Decimal.String() 262 case t.Date != nil: 263 return t.Date.String() 264 case t.Time != nil: 265 return t.Time.String() 266 case t.Timestamp != nil: 267 return t.Timestamp.String() 268 case t.Integer != nil: 269 return t.Integer.String() 270 case t.Unknown != nil: 271 return t.Unknown.String() 272 case t.Json != nil: 273 return t.Json.String() 274 case t.Bson != nil: 275 return t.Bson.String() 276 case t.UUID != nil: 277 return t.UUID.String() 278 default: 279 return "" 280 } 281 } 
282 283 // Represents a element inside a schema definition. 284 // 285 // - if it is a group (inner node) then type is undefined and num_children is 286 // defined 287 // 288 // - if it is a primitive type (leaf) then type is defined and num_children is 289 // undefined 290 // 291 // The nodes are listed in depth first traversal order. 292 type SchemaElement struct { 293 // Data type for this field. Not set if the current element is a non-leaf node. 294 Type *Type `thrift:"1,optional"` 295 296 // If type is FixedLenByteArray, this is the byte length of the values. 297 // Otherwise, if specified, this is the maximum bit length to store any of the values. 298 // (e.g. a low cardinality INT col could have this set to 3). Note that this is 299 // in the schema, and therefore fixed for the entire file. 300 TypeLength *int32 `thrift:"2,optional"` 301 302 // repetition of the field. The root of the schema does not have a repetition_type. 303 // All other nodes must have one. 304 RepetitionType *FieldRepetitionType `thrift:"3,optional"` 305 306 // Name of the field in the schema. 307 Name string `thrift:"4,required"` 308 309 // Nested fields. Since thrift does not support nested fields, 310 // the nesting is flattened to a single list by a depth-first traversal. 311 // The children count is used to construct the nested relationship. 312 // This field is not set when the element is a primitive type 313 NumChildren int32 `thrift:"5,optional"` 314 315 // DEPRECATED: When the schema is the result of a conversion from another model. 316 // Used to record the original type to help with cross conversion. 317 // 318 // This is superseded by logicalType. 319 ConvertedType *deprecated.ConvertedType `thrift:"6,optional"` 320 321 // DEPRECATED: Used when this column contains decimal data. 322 // See the DECIMAL converted type for more details. 323 // 324 // This is superseded by using the DecimalType annotation in logicalType. 
325 Scale *int32 `thrift:"7,optional"` 326 Precision *int32 `thrift:"8,optional"` 327 328 // When the original schema supports field ids, this will save the 329 // original field id in the parquet schema. 330 FieldID int32 `thrift:"9,optional"` 331 332 // The logical type of this SchemaElement 333 // 334 // LogicalType replaces ConvertedType, but ConvertedType is still required 335 // for some logical types to ensure forward-compatibility in format v1. 336 LogicalType *LogicalType `thrift:"10,optional"` 337 } 338 339 // Encodings supported by Parquet. Not all encodings are valid for all types. 340 // These enums are also used to specify the encoding of definition and 341 // repetition levels. See the accompanying doc for the details of the more 342 // complicated encodings. 343 type Encoding int32 344 345 const ( 346 // Default encoding. 347 // Boolean - 1 bit per value. 0 is false; 1 is true. 348 // Int32 - 4 bytes per value. Stored as little-endian. 349 // Int64 - 8 bytes per value. Stored as little-endian. 350 // Float - 4 bytes per value. IEEE. Stored as little-endian. 351 // Double - 8 bytes per value. IEEE. Stored as little-endian. 352 // ByteArray - 4 byte length stored as little endian, followed by bytes. 353 // FixedLenByteArray - Just the bytes. 354 Plain Encoding = 0 355 356 // Group VarInt encoding for Int32/Int64. 357 // This encoding is deprecated. It was never used. 358 // GroupVarInt Encoding = 1 359 360 // Deprecated: Dictionary encoding. The values in the dictionary are encoded 361 // in the plain type. 362 // In a data page use RLEDictionary instead. 363 // In a Dictionary page use Plain instead. 364 PlainDictionary Encoding = 2 365 366 // Group packed run length encoding. Usable for definition/repetition levels 367 // encoding and Booleans (on one bit: 0 is false 1 is true.) 368 RLE Encoding = 3 369 370 // Bit packed encoding. This can only be used if the data has a known max 371 // width. Usable for definition/repetition levels encoding. 
372 BitPacked Encoding = 4 373 374 // Delta encoding for integers. This can be used for int columns and works best 375 // on sorted data. 376 DeltaBinaryPacked Encoding = 5 377 378 // Encoding for byte arrays to separate the length values and the data. 379 // The lengths are encoded using DeltaBinaryPacked. 380 DeltaLengthByteArray Encoding = 6 381 382 // Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. 383 // Suffixes are stored as delta length byte arrays. 384 DeltaByteArray Encoding = 7 385 386 // Dictionary encoding: the ids are encoded using the RLE encoding 387 RLEDictionary Encoding = 8 388 389 // Encoding for floating-point data. 390 // K byte-streams are created where K is the size in bytes of the data type. 391 // The individual bytes of an FP value are scattered to the corresponding stream and 392 // the streams are concatenated. 393 // This itself does not reduce the size of the data but can lead to better compression 394 // afterwards. 395 ByteStreamSplit Encoding = 9 396 ) 397 398 func (e Encoding) String() string { 399 switch e { 400 case Plain: 401 return "PLAIN" 402 case PlainDictionary: 403 return "PLAIN_DICTIONARY" 404 case RLE: 405 return "RLE" 406 case BitPacked: 407 return "BIT_PACKED" 408 case DeltaBinaryPacked: 409 return "DELTA_BINARY_PACKED" 410 case DeltaLengthByteArray: 411 return "DELTA_LENGTH_BYTE_ARRAY" 412 case DeltaByteArray: 413 return "DELTA_BYTE_ARRAY" 414 case RLEDictionary: 415 return "RLE_DICTIONARY" 416 case ByteStreamSplit: 417 return "BYTE_STREAM_SPLIT" 418 default: 419 return "Encoding(?)" 420 } 421 } 422 423 // Supported compression algorithms. 424 // 425 // Codecs added in format version X.Y can be read by readers based on X.Y and later. 426 // Codec support may vary between readers based on the format version and 427 // libraries available at runtime. 428 // 429 // See Compression.md for a detailed specification of these algorithms. 
type CompressionCodec int32

const (
	Uncompressed CompressionCodec = 0
	Snappy       CompressionCodec = 1
	Gzip         CompressionCodec = 2
	LZO          CompressionCodec = 3
	Brotli       CompressionCodec = 4 // Added in 2.4
	Lz4          CompressionCodec = 5 // DEPRECATED (Added in 2.4)
	Zstd         CompressionCodec = 6 // Added in 2.4
	Lz4Raw       CompressionCodec = 7 // Added in 2.9
)

// String returns the upper-case name of c, or "CompressionCodec(?)" when c is
// not one of the constants declared above.
func (c CompressionCodec) String() string {
	switch c {
	case Uncompressed:
		return "UNCOMPRESSED"
	case Snappy:
		return "SNAPPY"
	case Gzip:
		return "GZIP"
	case LZO:
		return "LZO"
	case Brotli:
		return "BROTLI"
	case Lz4:
		return "LZ4"
	case Zstd:
		return "ZSTD"
	case Lz4Raw:
		return "LZ4_RAW"
	default:
		return "CompressionCodec(?)"
	}
}

// PageType identifies the kind of a page; the known kinds are the constants
// declared below.
type PageType int32

const (
	DataPage       PageType = 0
	IndexPage      PageType = 1
	DictionaryPage PageType = 2
	// Version 2 is indicated in the PageHeader and the use of DataPageHeaderV2,
	// and allows you to read repetition and definition level data without
	// decompressing the Page.
	DataPageV2 PageType = 3
)

// String returns the upper-case name of p, or "PageType(?)" when p is not one
// of the constants declared above.
func (p PageType) String() string {
	switch p {
	case DataPage:
		return "DATA_PAGE"
	case IndexPage:
		return "INDEX_PAGE"
	case DictionaryPage:
		return "DICTIONARY_PAGE"
	case DataPageV2:
		return "DATA_PAGE_V2"
	default:
		return "PageType(?)"
	}
}

// Enum to annotate whether lists of min/max elements inside ColumnIndex
// are ordered and if so, in which direction.
495 type BoundaryOrder int32 496 497 const ( 498 Unordered BoundaryOrder = 0 499 Ascending BoundaryOrder = 1 500 Descending BoundaryOrder = 2 501 ) 502 503 func (b BoundaryOrder) String() string { 504 switch b { 505 case Unordered: 506 return "UNORDERED" 507 case Ascending: 508 return "ASCENDING" 509 case Descending: 510 return "DESCENDING" 511 default: 512 return "BoundaryOrder(?)" 513 } 514 } 515 516 // Data page header. 517 type DataPageHeader struct { 518 // Number of values, including NULLs, in this data page. 519 NumValues int32 `thrift:"1,required"` 520 521 // Encoding used for this data page. 522 Encoding Encoding `thrift:"2,required"` 523 524 // Encoding used for definition levels. 525 DefinitionLevelEncoding Encoding `thrift:"3,required"` 526 527 // Encoding used for repetition levels. 528 RepetitionLevelEncoding Encoding `thrift:"4,required"` 529 530 // Optional statistics for the data in this page. 531 Statistics Statistics `thrift:"5,optional"` 532 } 533 534 type IndexPageHeader struct { 535 // TODO 536 } 537 538 // The dictionary page must be placed at the first position of the column chunk 539 // if it is partly or completely dictionary encoded. At most one dictionary page 540 // can be placed in a column chunk. 541 type DictionaryPageHeader struct { 542 // Number of values in the dictionary. 543 NumValues int32 `thrift:"1,required"` 544 545 // Encoding using this dictionary page. 546 Encoding Encoding `thrift:"2,required"` 547 548 // If true, the entries in the dictionary are sorted in ascending order. 549 IsSorted bool `thrift:"3,optional"` 550 } 551 552 // New page format allowing reading levels without decompressing the data 553 // Repetition and definition levels are uncompressed 554 // The remaining section containing the data is compressed if is_compressed is 555 // true. 556 type DataPageHeaderV2 struct { 557 // Number of values, including NULLs, in this data page. 
558 NumValues int32 `thrift:"1,required"` 559 // Number of NULL values, in this data page. 560 // Number of non-null = num_values - num_nulls which is also the number of 561 // values in the data section. 562 NumNulls int32 `thrift:"2,required"` 563 // Number of rows in this data page. which means pages change on record boundaries (r = 0). 564 NumRows int32 `thrift:"3,required"` 565 // Encoding used for data in this page. 566 Encoding Encoding `thrift:"4,required"` 567 568 // Repetition levels and definition levels are always using RLE (without size in it). 569 570 // Length of the definition levels. 571 DefinitionLevelsByteLength int32 `thrift:"5,required"` 572 // Length of the repetition levels. 573 RepetitionLevelsByteLength int32 `thrift:"6,required"` 574 575 // Whether the values are compressed. 576 // Which means the section of the page between 577 // definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) 578 // is compressed with the compression_codec. 579 // If missing it is considered compressed. 580 IsCompressed *bool `thrift:"7,optional"` 581 582 // Optional statistics for the data in this page. 583 Statistics Statistics `thrift:"8,optional"` 584 } 585 586 // Block-based algorithm type annotation. 587 type SplitBlockAlgorithm struct{} 588 589 // The algorithm used in Bloom filter. 590 type BloomFilterAlgorithm struct { // union 591 Block *SplitBlockAlgorithm `thrift:"1"` 592 } 593 594 // Hash strategy type annotation. xxHash is an extremely fast non-cryptographic 595 // hash algorithm. It uses 64 bits version of xxHash. 596 type XxHash struct{} 597 598 // The hash function used in Bloom filter. This function takes the hash of a 599 // column value using plain encoding. 600 type BloomFilterHash struct { // union 601 XxHash *XxHash `thrift:"1"` 602 } 603 604 // The compression used in the Bloom filter. 
605 type BloomFilterUncompressed struct{} 606 type BloomFilterCompression struct { // union 607 Uncompressed *BloomFilterUncompressed `thrift:"1"` 608 } 609 610 // Bloom filter header is stored at beginning of Bloom filter data of each column 611 // and followed by its bitset. 612 type BloomFilterHeader struct { 613 // The size of bitset in bytes. 614 NumBytes int32 `thrift:"1,required"` 615 // The algorithm for setting bits. 616 Algorithm BloomFilterAlgorithm `thrift:"2,required"` 617 // The hash function used for Bloom filter. 618 Hash BloomFilterHash `thrift:"3,required"` 619 // The compression used in the Bloom filter. 620 Compression BloomFilterCompression `thrift:"4,required"` 621 } 622 623 type PageHeader struct { 624 // The type of the page indicates which of the *Header fields below is set. 625 Type PageType `thrift:"1,required"` 626 627 // Uncompressed page size in bytes (not including this header). 628 UncompressedPageSize int32 `thrift:"2,required"` 629 630 // Compressed (and potentially encrypted) page size in bytes, not including 631 // this header. 632 CompressedPageSize int32 `thrift:"3,required"` 633 634 // The 32bit CRC for the page, to be be calculated as follows: 635 // - Using the standard CRC32 algorithm 636 // - On the data only, i.e. this header should not be included. 'Data' 637 // hereby refers to the concatenation of the repetition levels, the 638 // definition levels and the column value, in this exact order. 639 // - On the encoded versions of the repetition levels, definition levels and 640 // column values. 641 // - On the compressed versions of the repetition levels, definition levels 642 // and column values where possible; 643 // - For v1 data pages, the repetition levels, definition levels and column 644 // values are always compressed together. If a compression scheme is 645 // specified, the CRC shall be calculated on the compressed version of 646 // this concatenation. 
If no compression scheme is specified, the CRC 647 // shall be calculated on the uncompressed version of this concatenation. 648 // - For v2 data pages, the repetition levels and definition levels are 649 // handled separately from the data and are never compressed (only 650 // encoded). If a compression scheme is specified, the CRC shall be 651 // calculated on the concatenation of the uncompressed repetition levels, 652 // uncompressed definition levels and the compressed column values. 653 // If no compression scheme is specified, the CRC shall be calculated on 654 // the uncompressed concatenation. 655 // - In encrypted columns, CRC is calculated after page encryption; the 656 // encryption itself is performed after page compression (if compressed) 657 // If enabled, this allows for disabling checksumming in HDFS if only a few 658 // pages need to be read. 659 CRC int32 `thrift:"4,optional"` 660 661 // Headers for page specific data. One only will be set. 662 DataPageHeader *DataPageHeader `thrift:"5,optional"` 663 IndexPageHeader *IndexPageHeader `thrift:"6,optional"` 664 DictionaryPageHeader *DictionaryPageHeader `thrift:"7,optional"` 665 DataPageHeaderV2 *DataPageHeaderV2 `thrift:"8,optional"` 666 } 667 668 // Wrapper struct to store key values. 669 type KeyValue struct { 670 Key string `thrift:"1,required"` 671 Value string `thrift:"2,required"` 672 } 673 674 // Wrapper struct to specify sort order. 675 type SortingColumn struct { 676 // The column index (in this row group) 677 ColumnIdx int32 `thrift:"1,required"` 678 679 // If true, indicates this column is sorted in descending order. 680 Descending bool `thrift:"2,required"` 681 682 // If true, nulls will come before non-null values, otherwise, 683 // nulls go at the end. 684 NullsFirst bool `thrift:"3,required"` 685 } 686 687 // Statistics of a given page type and encoding. 688 type PageEncodingStats struct { 689 // The page type (data/dic/...). 
690 PageType PageType `thrift:"1,required"` 691 692 // Encoding of the page. 693 Encoding Encoding `thrift:"2,required"` 694 695 // Number of pages of this type with this encoding. 696 Count int32 `thrift:"3,required"` 697 } 698 699 // Description for column metadata. 700 type ColumnMetaData struct { 701 // Type of this column. 702 Type Type `thrift:"1,required"` 703 704 // Set of all encodings used for this column. The purpose is to validate 705 // whether we can decode those pages. 706 Encoding []Encoding `thrift:"2,required"` 707 708 // Path in schema. 709 PathInSchema []string `thrift:"3,required"` 710 711 // Compression codec. 712 Codec CompressionCodec `thrift:"4,required"` 713 714 // Number of values in this column. 715 NumValues int64 `thrift:"5,required"` 716 717 // Total byte size of all uncompressed pages in this column chunk (including the headers). 718 TotalUncompressedSize int64 `thrift:"6,required"` 719 720 // Total byte size of all compressed, and potentially encrypted, pages 721 // in this column chunk (including the headers). 722 TotalCompressedSize int64 `thrift:"7,required"` 723 724 // Optional key/value metadata. 725 KeyValueMetadata []KeyValue `thrift:"8,optional"` 726 727 // Byte offset from beginning of file to first data page. 728 DataPageOffset int64 `thrift:"9,required"` 729 730 // Byte offset from beginning of file to root index page. 731 IndexPageOffset int64 `thrift:"10,optional"` 732 733 // Byte offset from the beginning of file to first (only) dictionary page. 734 DictionaryPageOffset int64 `thrift:"11,optional"` 735 736 // optional statistics for this column chunk. 737 Statistics Statistics `thrift:"12,optional"` 738 739 // Set of all encodings used for pages in this column chunk. 740 // This information can be used to determine if all data pages are 741 // dictionary encoded for example. 742 EncodingStats []PageEncodingStats `thrift:"13,optional"` 743 744 // Byte offset from beginning of file to Bloom filter data. 
745 BloomFilterOffset int64 `thrift:"14,optional"` 746 } 747 748 type EncryptionWithFooterKey struct{} 749 750 type EncryptionWithColumnKey struct { 751 // Column path in schema. 752 PathInSchema []string `thrift:"1,required"` 753 754 // Retrieval metadata of column encryption key. 755 KeyMetadata []byte `thrift:"2,optional"` 756 } 757 758 type ColumnCryptoMetaData struct { 759 EncryptionWithFooterKey *EncryptionWithFooterKey `thrift:"1"` 760 EncryptionWithColumnKey *EncryptionWithColumnKey `thrift:"2"` 761 } 762 763 type ColumnChunk struct { 764 // File where column data is stored. If not set, assumed to be same file as 765 // metadata. This path is relative to the current file. 766 FilePath string `thrift:"1,optional"` 767 768 // Byte offset in file_path to the ColumnMetaData. 769 FileOffset int64 `thrift:"2,required"` 770 771 // Column metadata for this chunk. This is the same content as what is at 772 // file_path/file_offset. Having it here has it replicated in the file 773 // metadata. 774 MetaData ColumnMetaData `thrift:"3,optional"` 775 776 // File offset of ColumnChunk's OffsetIndex. 777 OffsetIndexOffset int64 `thrift:"4,optional"` 778 779 // Size of ColumnChunk's OffsetIndex, in bytes. 780 OffsetIndexLength int32 `thrift:"5,optional"` 781 782 // File offset of ColumnChunk's ColumnIndex. 783 ColumnIndexOffset int64 `thrift:"6,optional"` 784 785 // Size of ColumnChunk's ColumnIndex, in bytes. 786 ColumnIndexLength int32 `thrift:"7,optional"` 787 788 // Crypto metadata of encrypted columns. 789 CryptoMetadata ColumnCryptoMetaData `thrift:"8,optional"` 790 791 // Encrypted column metadata for this chunk. 792 EncryptedColumnMetadata []byte `thrift:"9,optional"` 793 } 794 795 type RowGroup struct { 796 // Metadata for each column chunk in this row group. 797 // This list must have the same order as the SchemaElement list in FileMetaData. 
798 Columns []ColumnChunk `thrift:"1,required"` 799 800 // Total byte size of all the uncompressed column data in this row group. 801 TotalByteSize int64 `thrift:"2,required"` 802 803 // Number of rows in this row group. 804 NumRows int64 `thrift:"3,required"` 805 806 // If set, specifies a sort ordering of the rows in this RowGroup. 807 // The sorting columns can be a subset of all the columns. 808 SortingColumns []SortingColumn `thrift:"4,optional"` 809 810 // Byte offset from beginning of file to first page (data or dictionary) 811 // in this row group 812 FileOffset int64 `thrift:"5,optional"` 813 814 // Total byte size of all compressed (and potentially encrypted) column data 815 // in this row group. 816 TotalCompressedSize int64 `thrift:"6,optional"` 817 818 // Row group ordinal in the file. 819 Ordinal int16 `thrift:"7,optional"` 820 } 821 822 // Empty struct to signal the order defined by the physical or logical type. 823 type TypeDefinedOrder struct{} 824 825 // Union to specify the order used for the min_value and max_value fields for a 826 // column. This union takes the role of an enhanced enum that allows rich 827 // elements (which will be needed for a collation-based ordering in the future). 828 // 829 // Possible values are: 830 // 831 // TypeDefinedOrder - the column uses the order defined by its logical or 832 // physical type (if there is no logical type). 833 // 834 // If the reader does not support the value of this union, min and max stats 835 // for this column should be ignored. 
// ColumnOrder is a union describing the sort order that applies to the
// min/max values recorded for a column (see FileMetaData.ColumnOrders).
type ColumnOrder struct { // union
	// The sort orders for logical types are:
	//   UTF8 - unsigned byte-wise comparison
	//   INT8 - signed comparison
	//   INT16 - signed comparison
	//   INT32 - signed comparison
	//   INT64 - signed comparison
	//   UINT8 - unsigned comparison
	//   UINT16 - unsigned comparison
	//   UINT32 - unsigned comparison
	//   UINT64 - unsigned comparison
	//   DECIMAL - signed comparison of the represented value
	//   DATE - signed comparison
	//   TIME_MILLIS - signed comparison
	//   TIME_MICROS - signed comparison
	//   TIMESTAMP_MILLIS - signed comparison
	//   TIMESTAMP_MICROS - signed comparison
	//   INTERVAL - unsigned comparison
	//   JSON - unsigned byte-wise comparison
	//   BSON - unsigned byte-wise comparison
	//   ENUM - unsigned byte-wise comparison
	//   LIST - undefined
	//   MAP - undefined
	//
	// In the absence of logical types, the sort order is determined by the physical type:
	//   BOOLEAN - false, true
	//   INT32 - signed comparison
	//   INT64 - signed comparison
	//   INT96 (only used for legacy timestamps) - undefined
	//   FLOAT - signed comparison of the represented value (*)
	//   DOUBLE - signed comparison of the represented value (*)
	//   BYTE_ARRAY - unsigned byte-wise comparison
	//   FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
	//
	// (*) Because the sorting order is not specified properly for floating
	//     point values (relations vs. total ordering) the following
	//     compatibility rules should be applied when reading statistics:
	//     - If the min is a NaN, it should be ignored.
	//     - If the max is a NaN, it should be ignored.
	//     - If the min is +0, the row group may contain -0 values as well.
	//     - If the max is -0, the row group may contain +0 values as well.
	//     - When looking for NaN values, min and max should be ignored.
	TypeOrder *TypeDefinedOrder `thrift:"1"`
}

// PageLocation describes where a single page lives in the file: its byte
// offset, its size including the header, and the index of its first row.
type PageLocation struct {
	// Offset of the page in the file.
	Offset int64 `thrift:"1,required"`

	// Size of the page, including header. Sum of compressed_page_size and
	// header length.
	CompressedPageSize int32 `thrift:"2,required"`

	// Index within the RowGroup of the first row of the page; this means
	// pages change on record boundaries (r = 0).
	// NOTE(review): "r" presumably denotes the repetition level — confirm
	// against the Parquet format specification.
	FirstRowIndex int64 `thrift:"3,required"`
}

// OffsetIndex holds the ordered list of page locations. The per-page lists
// of a ColumnIndex are matched to these locations by position.
type OffsetIndex struct {
	// PageLocations, ordered by increasing PageLocation.offset. It is required
	// that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
	PageLocations []PageLocation `thrift:"1,required"`
}

// ColumnIndex holds per-page information (null flags, lower and upper
// bounds, and optional null counts) as parallel lists.
// Each <array-field>[i] refers to the page at OffsetIndex.PageLocations[i].
type ColumnIndex struct {
	// A list of Boolean values to determine the validity of the corresponding
	// min and max values. If true, a page contains only null values, and writers
	// have to set the corresponding entries in min_values and max_values to
	// byte[0], so that all lists have the same length. If false, the
	// corresponding entries in min_values and max_values must be valid.
	NullPages []bool `thrift:"1,required"`

	// Two lists containing lower and upper bounds for the values of each page
	// determined by the ColumnOrder of the column. These may be the actual
	// minimum and maximum values found on a page, but can also be (more compact)
	// values that do not exist on a page. For example, instead of storing "Blart
	// Versenwald III", a writer may set min_values[i]="B", max_values[i]="C".
	// Such more compact values must still be valid values within the column's
	// logical type. Readers must make sure that list entries are populated before
	// using them by inspecting null_pages.
	MinValues [][]byte `thrift:"2,required"`
	MaxValues [][]byte `thrift:"3,required"`

	// Stores whether both min_values and max_values are ordered and if so, in
	// which direction. This allows readers to perform binary searches in both
	// lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
	// if the lists are ordered.
	BoundaryOrder BoundaryOrder `thrift:"4,required"`

	// A list containing the number of null values for each page.
	NullCounts []int64 `thrift:"5,optional"`
}

// AesGcmV1 holds the AAD-related parameters for the AesGcmV1 variant of
// EncryptionAlgorithm. All fields are optional.
type AesGcmV1 struct {
	// AAD prefix.
	AadPrefix []byte `thrift:"1,optional"`

	// Unique file identifier part of AAD suffix.
	AadFileUnique []byte `thrift:"2,optional"`

	// In files encrypted with AAD prefix without storing it,
	// readers must supply the prefix.
	SupplyAadPrefix bool `thrift:"3,optional"`
}

// AesGcmCtrV1 holds the AAD-related parameters for the AesGcmCtrV1 variant of
// EncryptionAlgorithm. It declares the same fields as AesGcmV1; all of them
// are optional.
type AesGcmCtrV1 struct {
	// AAD prefix.
	AadPrefix []byte `thrift:"1,optional"`

	// Unique file identifier part of AAD suffix.
	AadFileUnique []byte `thrift:"2,optional"`

	// In files encrypted with AAD prefix without storing it,
	// readers must supply the prefix.
	SupplyAadPrefix bool `thrift:"3,optional"`
}

// EncryptionAlgorithm is a union over the supported encryption algorithm
// descriptions.
// NOTE(review): as a union, presumably only one of the fields is non-nil at a
// time — confirm against the thrift encoder.
type EncryptionAlgorithm struct { // union
	AesGcmV1    *AesGcmV1    `thrift:"1"`
	AesGcmCtrV1 *AesGcmCtrV1 `thrift:"2"`
}

// FileMetaData is the description for file metadata: format version, schema,
// row count and row groups, plus optional key/value metadata, writer
// identification, column sort orders and encryption information.
type FileMetaData struct {
	// Version of this file.
	Version int32 `thrift:"1,required"`

	// Parquet schema for this file. This schema contains metadata for all the columns.
	// The schema is represented as a tree with a single root. The nodes of the tree
	// are flattened to a list by doing a depth-first traversal.
	// The column metadata contains the path in the schema for that column which can be
	// used to map columns to nodes in the schema.
	// The first element is the root.
	Schema []SchemaElement `thrift:"2,required"`

	// Number of rows in this file.
	NumRows int64 `thrift:"3,required"`

	// Row groups in this file.
	RowGroups []RowGroup `thrift:"4,required"`

	// Optional key/value metadata.
	KeyValueMetadata []KeyValue `thrift:"5,optional"`

	// String for application that wrote this file. This should be in the format
	// <Application> version <App Version> (build <App Build Hash>).
	// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
	CreatedBy string `thrift:"6,optional"`

	// Sort order used for the min_value and max_value fields in the Statistics
	// objects and the min_values and max_values fields in the ColumnIndex
	// objects of each column in this file. Sort orders are listed in the order
	// matching the columns in the schema. The indexes are not necessarily the same
	// though, because only leaf nodes of the schema are represented in the list
	// of sort orders.
	//
	// Without column_orders, the meaning of the min_value and max_value fields
	// in the Statistics object and the ColumnIndex object is undefined. To ensure
	// well-defined behavior, if these fields are written to a Parquet file,
	// column_orders must be written as well.
	//
	// The obsolete min and max fields in the Statistics object are always sorted
	// by signed comparison regardless of column_orders.
	ColumnOrders []ColumnOrder `thrift:"7,optional"`

	// Encryption algorithm. This field is set only in encrypted files
	// with plaintext footer. Files with encrypted footer store algorithm id
	// in FileCryptoMetaData structure.
	EncryptionAlgorithm EncryptionAlgorithm `thrift:"8,optional"`

	// Retrieval metadata of key used for signing the footer.
	// Used only in encrypted files with plaintext footer.
	FooterSigningKeyMetadata []byte `thrift:"9,optional"`
}

// FileCryptoMetaData is the crypto metadata for files with encrypted footer.
type FileCryptoMetaData struct {
	// Encryption algorithm. This field is only used for files
	// with encrypted footer. Files with plaintext footer store algorithm id
	// inside footer (FileMetaData structure).
	EncryptionAlgorithm EncryptionAlgorithm `thrift:"1,required"`

	// Retrieval metadata of key used for encryption of footer,
	// and (possibly) columns.
	KeyMetadata []byte `thrift:"2,optional"`
}