github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/format/parquet.go (about) 1 package format 2 3 import ( 4 "fmt" 5 6 "github.com/vc42/parquet-go/deprecated" 7 ) 8 9 // Types supported by Parquet. These types are intended to be used in combination 10 // with the encodings to control the on disk storage format. For example INT16 11 // is not included as a type since a good encoding of INT32 would handle this. 12 type Type int32 13 14 const ( 15 Boolean Type = 0 16 Int32 Type = 1 17 Int64 Type = 2 18 Int96 Type = 3 // deprecated, only used by legacy implementations. 19 Float Type = 4 20 Double Type = 5 21 ByteArray Type = 6 22 FixedLenByteArray Type = 7 23 ) 24 25 func (t Type) String() string { 26 switch t { 27 case Boolean: 28 return "BOOLEAN" 29 case Int32: 30 return "INT32" 31 case Int64: 32 return "INT64" 33 case Int96: 34 return "INT96" 35 case Float: 36 return "FLOAT" 37 case Double: 38 return "DOUBLE" 39 case ByteArray: 40 return "BYTE_ARRAY" 41 case FixedLenByteArray: 42 return "FIXED_LEN_BYTE_ARRAY" 43 default: 44 return "Type(?)" 45 } 46 } 47 48 // Representation of Schemas. 49 type FieldRepetitionType int32 50 51 const ( 52 // The field is required (can not be null) and each record has exactly 1 value. 53 Required FieldRepetitionType = 0 54 // The field is optional (can be null) and each record has 0 or 1 values. 55 Optional FieldRepetitionType = 1 56 // The field is repeated and can contain 0 or more values. 57 Repeated FieldRepetitionType = 2 58 ) 59 60 func (t FieldRepetitionType) String() string { 61 switch t { 62 case Required: 63 return "REQUIRED" 64 case Optional: 65 return "OPTIONAL" 66 case Repeated: 67 return "REPEATED" 68 default: 69 return "FieldRepeationaType(?)" 70 } 71 } 72 73 // Statistics per row group and per page. 74 // All fields are optional. 75 type Statistics struct { 76 // DEPRECATED: min and max value of the column. Use min_value and max_value. 
77 // 78 // Values are encoded using PLAIN encoding, except that variable-length byte 79 // arrays do not include a length prefix. 80 // 81 // These fields encode min and max values determined by signed comparison 82 // only. New files should use the correct order for a column's logical type 83 // and store the values in the min_value and max_value fields. 84 // 85 // To support older readers, these may be set when the column order is 86 // signed. 87 Max []byte `thrift:"1"` 88 Min []byte `thrift:"2"` 89 // Count of null value in the column. 90 NullCount int64 `thrift:"3"` 91 // Count of distinct values occurring. 92 DistinctCount int64 `thrift:"4"` 93 // Min and max values for the column, determined by its ColumnOrder. 94 // 95 // Values are encoded using PLAIN encoding, except that variable-length byte 96 // arrays do not include a length prefix. 97 MaxValue []byte `thrift:"5"` 98 MinValue []byte `thrift:"6"` 99 } 100 101 // Empty structs to use as logical type annotations. 102 type StringType struct{} // allowed for BINARY, must be encoded with UTF-8 103 type UUIDType struct{} // allowed for FIXED[16], must encode raw UUID bytes 104 type MapType struct{} // see see LogicalTypes.md 105 type ListType struct{} // see LogicalTypes.md 106 type EnumType struct{} // allowed for BINARY, must be encoded with UTF-8 107 type DateType struct{} // allowed for INT32 108 109 func (*StringType) String() string { return "STRING" } 110 func (*UUIDType) String() string { return "UUID" } 111 func (*MapType) String() string { return "MAP" } 112 func (*ListType) String() string { return "LIST" } 113 func (*EnumType) String() string { return "ENUM" } 114 func (*DateType) String() string { return "DATE" } 115 116 // Logical type to annotate a column that is always null. 117 // 118 // Sometimes when discovering the schema of existing data, values are always 119 // null and the physical type can't be determined. 
This annotation signals 120 // the case where the physical type was guessed from all null values. 121 type NullType struct{} 122 123 func (*NullType) String() string { return "NULL" } 124 125 // Decimal logical type annotation 126 // 127 // To maintain forward-compatibility in v1, implementations using this logical 128 // type must also set scale and precision on the annotated SchemaElement. 129 // 130 // Allowed for physical types: INT32, INT64, FIXED, and BINARY 131 type DecimalType struct { 132 Scale int32 `thrift:"1,required"` 133 Precision int32 `thrift:"2,required"` 134 } 135 136 func (t *DecimalType) String() string { 137 return fmt.Sprintf("DECIMAL(%d,%d)", t.Scale, t.Precision) 138 } 139 140 // Time units for logical types. 141 type MilliSeconds struct{} 142 type MicroSeconds struct{} 143 type NanoSeconds struct{} 144 145 func (*MilliSeconds) String() string { return "MILLIS" } 146 func (*MicroSeconds) String() string { return "MICROS" } 147 func (*NanoSeconds) String() string { return "NANOS" } 148 149 type TimeUnit struct { // union 150 Millis *MilliSeconds `thrift:"1"` 151 Micros *MicroSeconds `thrift:"2"` 152 Nanos *NanoSeconds `thrift:"3"` 153 } 154 155 func (u *TimeUnit) String() string { 156 switch { 157 case u.Millis != nil: 158 return u.Millis.String() 159 case u.Micros != nil: 160 return u.Micros.String() 161 case u.Nanos != nil: 162 return u.Nanos.String() 163 default: 164 return "" 165 } 166 } 167 168 // Timestamp logical type annotation 169 // 170 // Allowed for physical types: INT64 171 type TimestampType struct { 172 IsAdjustedToUTC bool `thrift:"1,required"` 173 Unit TimeUnit `thrift:"2,required"` 174 } 175 176 func (t *TimestampType) String() string { 177 return fmt.Sprintf("TIMESTAMP(isAdjustedToUTC=%t,unit=%s)", t.IsAdjustedToUTC, &t.Unit) 178 } 179 180 // Time logical type annotation 181 // 182 // Allowed for physical types: INT32 (millis), INT64 (micros, nanos) 183 type TimeType struct { 184 IsAdjustedToUTC bool `thrift:"1,required"` 
185 Unit TimeUnit `thrift:"2,required"` 186 } 187 188 func (t *TimeType) String() string { 189 return fmt.Sprintf("TIME(isAdjustedToUTC=%t,unit=%s)", t.IsAdjustedToUTC, &t.Unit) 190 } 191 192 // Integer logical type annotation 193 // 194 // bitWidth must be 8, 16, 32, or 64. 195 // 196 // Allowed for physical types: INT32, INT64 197 type IntType struct { 198 BitWidth int8 `thrift:"1,required"` 199 IsSigned bool `thrift:"2,required"` 200 } 201 202 func (t *IntType) String() string { 203 return fmt.Sprintf("INT(%d,%t)", t.BitWidth, t.IsSigned) 204 } 205 206 // Embedded JSON logical type annotation 207 // 208 // Allowed for physical types: BINARY 209 type JsonType struct{} 210 211 func (t *JsonType) String() string { return "JSON" } 212 213 // Embedded BSON logical type annotation 214 // 215 // Allowed for physical types: BINARY 216 type BsonType struct{} 217 218 func (t *BsonType) String() string { return "BSON" } 219 220 // LogicalType annotations to replace ConvertedType. 221 // 222 // To maintain compatibility, implementations using LogicalType for a 223 // SchemaElement must also set the corresponding ConvertedType (if any) 224 // from the following table. 
225 type LogicalType struct { // union 226 UTF8 *StringType `thrift:"1"` // use ConvertedType UTF8 227 Map *MapType `thrift:"2"` // use ConvertedType Map 228 List *ListType `thrift:"3"` // use ConvertedType List 229 Enum *EnumType `thrift:"4"` // use ConvertedType Enum 230 Decimal *DecimalType `thrift:"5"` // use ConvertedType Decimal + SchemaElement.{Scale, Precision} 231 Date *DateType `thrift:"6"` // use ConvertedType Date 232 233 // use ConvertedType TimeMicros for Time{IsAdjustedToUTC: *, Unit: Micros} 234 // use ConvertedType TimeMillis for Time{IsAdjustedToUTC: *, Unit: Millis} 235 Time *TimeType `thrift:"7"` 236 237 // use ConvertedType TimestampMicros for Timestamp{IsAdjustedToUTC: *, Unit: Micros} 238 // use ConvertedType TimestampMillis for Timestamp{IsAdjustedToUTC: *, Unit: Millis} 239 Timestamp *TimestampType `thrift:"8"` 240 241 // 9: reserved for Interval 242 Integer *IntType `thrift:"10"` // use ConvertedType Int* or Uint* 243 Unknown *NullType `thrift:"11"` // no compatible ConvertedType 244 Json *JsonType `thrift:"12"` // use ConvertedType JSON 245 Bson *BsonType `thrift:"13"` // use ConvertedType BSON 246 UUID *UUIDType `thrift:"14"` // no compatible ConvertedType 247 } 248 249 func (t *LogicalType) String() string { 250 switch { 251 case t.UTF8 != nil: 252 return t.UTF8.String() 253 case t.Map != nil: 254 return t.Map.String() 255 case t.List != nil: 256 return t.List.String() 257 case t.Enum != nil: 258 return t.Enum.String() 259 case t.Decimal != nil: 260 return t.Decimal.String() 261 case t.Date != nil: 262 return t.Date.String() 263 case t.Time != nil: 264 return t.Time.String() 265 case t.Timestamp != nil: 266 return t.Timestamp.String() 267 case t.Integer != nil: 268 return t.Integer.String() 269 case t.Unknown != nil: 270 return t.Unknown.String() 271 case t.Json != nil: 272 return t.Json.String() 273 case t.Bson != nil: 274 return t.Bson.String() 275 case t.UUID != nil: 276 return t.UUID.String() 277 default: 278 return "" 279 } 280 } 
281 282 // Represents a element inside a schema definition. 283 // 284 // - if it is a group (inner node) then type is undefined and num_children is 285 // defined 286 // 287 // - if it is a primitive type (leaf) then type is defined and num_children is 288 // undefined 289 // 290 // The nodes are listed in depth first traversal order. 291 type SchemaElement struct { 292 // Data type for this field. Not set if the current element is a non-leaf node. 293 Type *Type `thrift:"1,optional"` 294 295 // If type is FixedLenByteArray, this is the byte length of the values. 296 // Otherwise, if specified, this is the maximum bit length to store any of the values. 297 // (e.g. a low cardinality INT col could have this set to 3). Note that this is 298 // in the schema, and therefore fixed for the entire file. 299 TypeLength *int32 `thrift:"2,optional"` 300 301 // repetition of the field. The root of the schema does not have a repetition_type. 302 // All other nodes must have one. 303 RepetitionType *FieldRepetitionType `thrift:"3,optional"` 304 305 // Name of the field in the schema. 306 Name string `thrift:"4,required"` 307 308 // Nested fields. Since thrift does not support nested fields, 309 // the nesting is flattened to a single list by a depth-first traversal. 310 // The children count is used to construct the nested relationship. 311 // This field is not set when the element is a primitive type 312 NumChildren int32 `thrift:"5,optional"` 313 314 // DEPRECATED: When the schema is the result of a conversion from another model. 315 // Used to record the original type to help with cross conversion. 316 // 317 // This is superseded by logicalType. 318 ConvertedType *deprecated.ConvertedType `thrift:"6,optional"` 319 320 // DEPRECATED: Used when this column contains decimal data. 321 // See the DECIMAL converted type for more details. 322 // 323 // This is superseded by using the DecimalType annotation in logicalType. 
324 Scale *int32 `thrift:"7,optional"` 325 Precision *int32 `thrift:"8,optional"` 326 327 // When the original schema supports field ids, this will save the 328 // original field id in the parquet schema. 329 FieldID int32 `thrift:"9,optional"` 330 331 // The logical type of this SchemaElement 332 // 333 // LogicalType replaces ConvertedType, but ConvertedType is still required 334 // for some logical types to ensure forward-compatibility in format v1. 335 LogicalType *LogicalType `thrift:"10,optional"` 336 } 337 338 // Encodings supported by Parquet. Not all encodings are valid for all types. 339 // These enums are also used to specify the encoding of definition and 340 // repetition levels. See the accompanying doc for the details of the more 341 // complicated encodings. 342 type Encoding int32 343 344 const ( 345 // Default encoding. 346 // Boolean - 1 bit per value. 0 is false; 1 is true. 347 // Int32 - 4 bytes per value. Stored as little-endian. 348 // Int64 - 8 bytes per value. Stored as little-endian. 349 // Float - 4 bytes per value. IEEE. Stored as little-endian. 350 // Double - 8 bytes per value. IEEE. Stored as little-endian. 351 // ByteArray - 4 byte length stored as little endian, followed by bytes. 352 // FixedLenByteArray - Just the bytes. 353 Plain Encoding = 0 354 355 // Group VarInt encoding for Int32/Int64. 356 // This encoding is deprecated. It was never used. 357 // GroupVarInt Encoding = 1 358 359 // Deprecated: Dictionary encoding. The values in the dictionary are encoded 360 // in the plain type. 361 // In a data page use RLEDictionary instead. 362 // In a Dictionary page use Plain instead. 363 PlainDictionary Encoding = 2 364 365 // Group packed run length encoding. Usable for definition/repetition levels 366 // encoding and Booleans (on one bit: 0 is false 1 is true.) 367 RLE Encoding = 3 368 369 // Bit packed encoding. This can only be used if the data has a known max 370 // width. Usable for definition/repetition levels encoding. 
371 BitPacked Encoding = 4 372 373 // Delta encoding for integers. This can be used for int columns and works best 374 // on sorted data. 375 DeltaBinaryPacked Encoding = 5 376 377 // Encoding for byte arrays to separate the length values and the data. 378 // The lengths are encoded using DeltaBinaryPacked. 379 DeltaLengthByteArray Encoding = 6 380 381 // Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. 382 // Suffixes are stored as delta length byte arrays. 383 DeltaByteArray Encoding = 7 384 385 // Dictionary encoding: the ids are encoded using the RLE encoding 386 RLEDictionary Encoding = 8 387 388 // Encoding for floating-point data. 389 // K byte-streams are created where K is the size in bytes of the data type. 390 // The individual bytes of an FP value are scattered to the corresponding stream and 391 // the streams are concatenated. 392 // This itself does not reduce the size of the data but can lead to better compression 393 // afterwards. 394 ByteStreamSplit Encoding = 9 395 ) 396 397 func (e Encoding) String() string { 398 switch e { 399 case Plain: 400 return "PLAIN" 401 case PlainDictionary: 402 return "PLAIN_DICTIONARY" 403 case RLE: 404 return "RLE" 405 case BitPacked: 406 return "BIT_PACKED" 407 case DeltaBinaryPacked: 408 return "DELTA_BINARY_PACKED" 409 case DeltaLengthByteArray: 410 return "DELTA_LENGTH_BYTE_ARRAY" 411 case DeltaByteArray: 412 return "DELTA_BYTE_ARRAY" 413 case RLEDictionary: 414 return "RLE_DICTIONARY" 415 case ByteStreamSplit: 416 return "BYTE_STREAM_SPLIT" 417 default: 418 return "Encoding(?)" 419 } 420 } 421 422 // Supported compression algorithms. 423 // 424 // Codecs added in format version X.Y can be read by readers based on X.Y and later. 425 // Codec support may vary between readers based on the format version and 426 // libraries available at runtime. 427 // 428 // See Compression.md for a detailed specification of these algorithms. 
type CompressionCodec int32

// Compression codec identifiers; the trailing comments record the format
// version in which each later codec was added.
const (
	Uncompressed CompressionCodec = 0
	Snappy       CompressionCodec = 1
	Gzip         CompressionCodec = 2
	LZO          CompressionCodec = 3
	Brotli       CompressionCodec = 4 // Added in 2.4
	Lz4          CompressionCodec = 5 // DEPRECATED (Added in 2.4)
	Zstd         CompressionCodec = 6 // Added in 2.4
	Lz4Raw       CompressionCodec = 7 // Added in 2.9
)

// String returns the upper-case Parquet name of the codec, or
// "CompressionCodec(?)" when c is not one of the declared constants.
func (c CompressionCodec) String() string {
	switch c {
	case Uncompressed:
		return "UNCOMPRESSED"
	case Snappy:
		return "SNAPPY"
	case Gzip:
		return "GZIP"
	case LZO:
		return "LZO"
	case Brotli:
		return "BROTLI"
	case Lz4:
		return "LZ4"
	case Zstd:
		return "ZSTD"
	case Lz4Raw:
		return "LZ4_RAW"
	default:
		return "CompressionCodec(?)"
	}
}

// PageType identifies the kind of a page: data, index, dictionary or
// version 2 data page.
type PageType int32

const (
	DataPage       PageType = 0
	IndexPage      PageType = 1
	DictionaryPage PageType = 2
	// Version 2 is indicated in the PageHeader and the use of DataPageHeaderV2,
	// and allows you to read repetition and definition level data without
	// decompressing the Page.
	DataPageV2 PageType = 3
)

// String returns the upper-case Parquet name of the page type, or
// "PageType(?)" when p is not one of the declared constants.
func (p PageType) String() string {
	switch p {
	case DataPage:
		return "DATA_PAGE"
	case IndexPage:
		return "INDEX_PAGE"
	case DictionaryPage:
		return "DICTIONARY_PAGE"
	case DataPageV2:
		return "DATA_PAGE_V2"
	default:
		return "PageType(?)"
	}
}

// Enum to annotate whether lists of min/max elements inside ColumnIndex
// are ordered and if so, in which direction.
494 type BoundaryOrder int32 495 496 const ( 497 Unordered BoundaryOrder = 0 498 Ascending BoundaryOrder = 1 499 Descending BoundaryOrder = 2 500 ) 501 502 func (b BoundaryOrder) String() string { 503 switch b { 504 case Unordered: 505 return "UNORDERED" 506 case Ascending: 507 return "ASCENDING" 508 case Descending: 509 return "DESCENDING" 510 default: 511 return "BoundaryOrder(?)" 512 } 513 } 514 515 // Data page header. 516 type DataPageHeader struct { 517 // Number of values, including NULLs, in this data page. 518 NumValues int32 `thrift:"1,required"` 519 520 // Encoding used for this data page. 521 Encoding Encoding `thrift:"2,required"` 522 523 // Encoding used for definition levels. 524 DefinitionLevelEncoding Encoding `thrift:"3,required"` 525 526 // Encoding used for repetition levels. 527 RepetitionLevelEncoding Encoding `thrift:"4,required"` 528 529 // Optional statistics for the data in this page. 530 Statistics Statistics `thrift:"5,optional"` 531 } 532 533 type IndexPageHeader struct { 534 // TODO 535 } 536 537 // The dictionary page must be placed at the first position of the column chunk 538 // if it is partly or completely dictionary encoded. At most one dictionary page 539 // can be placed in a column chunk. 540 type DictionaryPageHeader struct { 541 // Number of values in the dictionary. 542 NumValues int32 `thrift:"1,required"` 543 544 // Encoding using this dictionary page. 545 Encoding Encoding `thrift:"2,required"` 546 547 // If true, the entries in the dictionary are sorted in ascending order. 548 IsSorted bool `thrift:"3,optional"` 549 } 550 551 // New page format allowing reading levels without decompressing the data 552 // Repetition and definition levels are uncompressed 553 // The remaining section containing the data is compressed if is_compressed is 554 // true. 555 type DataPageHeaderV2 struct { 556 // Number of values, including NULLs, in this data page. 
557 NumValues int32 `thrift:"1,required"` 558 // Number of NULL values, in this data page. 559 // Number of non-null = num_values - num_nulls which is also the number of 560 // values in the data section. 561 NumNulls int32 `thrift:"2,required"` 562 // Number of rows in this data page. which means pages change on record boundaries (r = 0). 563 NumRows int32 `thrift:"3,required"` 564 // Encoding used for data in this page. 565 Encoding Encoding `thrift:"4,required"` 566 567 // Repetition levels and definition levels are always using RLE (without size in it). 568 569 // Length of the definition levels. 570 DefinitionLevelsByteLength int32 `thrift:"5,required"` 571 // Length of the repetition levels. 572 RepetitionLevelsByteLength int32 `thrift:"6,required"` 573 574 // Whether the values are compressed. 575 // Which means the section of the page between 576 // definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) 577 // is compressed with the compression_codec. 578 // If missing it is considered compressed. 579 IsCompressed *bool `thrift:"7,optional"` 580 581 // Optional statistics for the data in this page. 582 Statistics Statistics `thrift:"8,optional"` 583 } 584 585 // Block-based algorithm type annotation. 586 type SplitBlockAlgorithm struct{} 587 588 // The algorithm used in Bloom filter. 589 type BloomFilterAlgorithm struct { // union 590 Block *SplitBlockAlgorithm `thrift:"1"` 591 } 592 593 // Hash strategy type annotation. xxHash is an extremely fast non-cryptographic 594 // hash algorithm. It uses 64 bits version of xxHash. 595 type XxHash struct{} 596 597 // The hash function used in Bloom filter. This function takes the hash of a 598 // column value using plain encoding. 599 type BloomFilterHash struct { // union 600 XxHash *XxHash `thrift:"1"` 601 } 602 603 // The compression used in the Bloom filter. 
604 type BloomFilterUncompressed struct{} 605 type BloomFilterCompression struct { // union 606 Uncompressed *BloomFilterUncompressed `thrift:"1"` 607 } 608 609 // Bloom filter header is stored at beginning of Bloom filter data of each column 610 // and followed by its bitset. 611 type BloomFilterHeader struct { 612 // The size of bitset in bytes. 613 NumBytes int32 `thrift:"1,required"` 614 // The algorithm for setting bits. 615 Algorithm BloomFilterAlgorithm `thrift:"2,required"` 616 // The hash function used for Bloom filter. 617 Hash BloomFilterHash `thrift:"3,required"` 618 // The compression used in the Bloom filter. 619 Compression BloomFilterCompression `thrift:"4,required"` 620 } 621 622 type PageHeader struct { 623 // The type of the page indicates which of the *Header fields below is set. 624 Type PageType `thrift:"1,required"` 625 626 // Uncompressed page size in bytes (not including this header). 627 UncompressedPageSize int32 `thrift:"2,required"` 628 629 // Compressed (and potentially encrypted) page size in bytes, not including 630 // this header. 631 CompressedPageSize int32 `thrift:"3,required"` 632 633 // The 32bit CRC for the page, to be be calculated as follows: 634 // - Using the standard CRC32 algorithm 635 // - On the data only, i.e. this header should not be included. 'Data' 636 // hereby refers to the concatenation of the repetition levels, the 637 // definition levels and the column value, in this exact order. 638 // - On the encoded versions of the repetition levels, definition levels and 639 // column values. 640 // - On the compressed versions of the repetition levels, definition levels 641 // and column values where possible; 642 // - For v1 data pages, the repetition levels, definition levels and column 643 // values are always compressed together. If a compression scheme is 644 // specified, the CRC shall be calculated on the compressed version of 645 // this concatenation. 
If no compression scheme is specified, the CRC 646 // shall be calculated on the uncompressed version of this concatenation. 647 // - For v2 data pages, the repetition levels and definition levels are 648 // handled separately from the data and are never compressed (only 649 // encoded). If a compression scheme is specified, the CRC shall be 650 // calculated on the concatenation of the uncompressed repetition levels, 651 // uncompressed definition levels and the compressed column values. 652 // If no compression scheme is specified, the CRC shall be calculated on 653 // the uncompressed concatenation. 654 // - In encrypted columns, CRC is calculated after page encryption; the 655 // encryption itself is performed after page compression (if compressed) 656 // If enabled, this allows for disabling checksumming in HDFS if only a few 657 // pages need to be read. 658 CRC int32 `thrift:"4,optional"` 659 660 // Headers for page specific data. One only will be set. 661 DataPageHeader *DataPageHeader `thrift:"5,optional"` 662 IndexPageHeader *IndexPageHeader `thrift:"6,optional"` 663 DictionaryPageHeader *DictionaryPageHeader `thrift:"7,optional"` 664 DataPageHeaderV2 *DataPageHeaderV2 `thrift:"8,optional"` 665 } 666 667 // Wrapper struct to store key values. 668 type KeyValue struct { 669 Key string `thrift:"1,required"` 670 Value string `thrift:"2,required"` 671 } 672 673 // Wrapper struct to specify sort order. 674 type SortingColumn struct { 675 // The column index (in this row group) 676 ColumnIdx int32 `thrift:"1,required"` 677 678 // If true, indicates this column is sorted in descending order. 679 Descending bool `thrift:"2,required"` 680 681 // If true, nulls will come before non-null values, otherwise, 682 // nulls go at the end. 683 NullsFirst bool `thrift:"3,required"` 684 } 685 686 // Statistics of a given page type and encoding. 687 type PageEncodingStats struct { 688 // The page type (data/dic/...). 
689 PageType PageType `thrift:"1,required"` 690 691 // Encoding of the page. 692 Encoding Encoding `thrift:"2,required"` 693 694 // Number of pages of this type with this encoding. 695 Count int32 `thrift:"3,required"` 696 } 697 698 // Description for column metadata. 699 type ColumnMetaData struct { 700 // Type of this column. 701 Type Type `thrift:"1,required"` 702 703 // Set of all encodings used for this column. The purpose is to validate 704 // whether we can decode those pages. 705 Encoding []Encoding `thrift:"2,required"` 706 707 // Path in schema. 708 PathInSchema []string `thrift:"3,required"` 709 710 // Compression codec. 711 Codec CompressionCodec `thrift:"4,required"` 712 713 // Number of values in this column. 714 NumValues int64 `thrift:"5,required"` 715 716 // Total byte size of all uncompressed pages in this column chunk (including the headers). 717 TotalUncompressedSize int64 `thrift:"6,required"` 718 719 // Total byte size of all compressed, and potentially encrypted, pages 720 // in this column chunk (including the headers). 721 TotalCompressedSize int64 `thrift:"7,required"` 722 723 // Optional key/value metadata. 724 KeyValueMetadata []KeyValue `thrift:"8,optional"` 725 726 // Byte offset from beginning of file to first data page. 727 DataPageOffset int64 `thrift:"9,required"` 728 729 // Byte offset from beginning of file to root index page. 730 IndexPageOffset int64 `thrift:"10,optional"` 731 732 // Byte offset from the beginning of file to first (only) dictionary page. 733 DictionaryPageOffset int64 `thrift:"11,optional"` 734 735 // optional statistics for this column chunk. 736 Statistics Statistics `thrift:"12,optional"` 737 738 // Set of all encodings used for pages in this column chunk. 739 // This information can be used to determine if all data pages are 740 // dictionary encoded for example. 741 EncodingStats []PageEncodingStats `thrift:"13,optional"` 742 743 // Byte offset from beginning of file to Bloom filter data. 
744 BloomFilterOffset int64 `thrift:"14,optional"` 745 } 746 747 type EncryptionWithFooterKey struct{} 748 749 type EncryptionWithColumnKey struct { 750 // Column path in schema. 751 PathInSchema []string `thrift:"1,required"` 752 753 // Retrieval metadata of column encryption key. 754 KeyMetadata []byte `thrift:"2,optional"` 755 } 756 757 type ColumnCryptoMetaData struct { 758 EncryptionWithFooterKey *EncryptionWithFooterKey `thrift:"1"` 759 EncryptionWithColumnKey *EncryptionWithColumnKey `thrift:"2"` 760 } 761 762 type ColumnChunk struct { 763 // File where column data is stored. If not set, assumed to be same file as 764 // metadata. This path is relative to the current file. 765 FilePath string `thrift:"1,optional"` 766 767 // Byte offset in file_path to the ColumnMetaData. 768 FileOffset int64 `thrift:"2,required"` 769 770 // Column metadata for this chunk. This is the same content as what is at 771 // file_path/file_offset. Having it here has it replicated in the file 772 // metadata. 773 MetaData ColumnMetaData `thrift:"3,optional"` 774 775 // File offset of ColumnChunk's OffsetIndex. 776 OffsetIndexOffset int64 `thrift:"4,optional"` 777 778 // Size of ColumnChunk's OffsetIndex, in bytes. 779 OffsetIndexLength int32 `thrift:"5,optional"` 780 781 // File offset of ColumnChunk's ColumnIndex. 782 ColumnIndexOffset int64 `thrift:"6,optional"` 783 784 // Size of ColumnChunk's ColumnIndex, in bytes. 785 ColumnIndexLength int32 `thrift:"7,optional"` 786 787 // Crypto metadata of encrypted columns. 788 CryptoMetadata ColumnCryptoMetaData `thrift:"8,optional"` 789 790 // Encrypted column metadata for this chunk. 791 EncryptedColumnMetadata []byte `thrift:"9,optional"` 792 } 793 794 type RowGroup struct { 795 // Metadata for each column chunk in this row group. 796 // This list must have the same order as the SchemaElement list in FileMetaData. 
797 Columns []ColumnChunk `thrift:"1,required"` 798 799 // Total byte size of all the uncompressed column data in this row group. 800 TotalByteSize int64 `thrift:"2,required"` 801 802 // Number of rows in this row group. 803 NumRows int64 `thrift:"3,required"` 804 805 // If set, specifies a sort ordering of the rows in this RowGroup. 806 // The sorting columns can be a subset of all the columns. 807 SortingColumns []SortingColumn `thrift:"4,optional"` 808 809 // Byte offset from beginning of file to first page (data or dictionary) 810 // in this row group 811 FileOffset int64 `thrift:"5,optional"` 812 813 // Total byte size of all compressed (and potentially encrypted) column data 814 // in this row group. 815 TotalCompressedSize int64 `thrift:"6,optional"` 816 817 // Row group ordinal in the file. 818 Ordinal int16 `thrift:"7,optional"` 819 } 820 821 // Empty struct to signal the order defined by the physical or logical type. 822 type TypeDefinedOrder struct{} 823 824 // Union to specify the order used for the min_value and max_value fields for a 825 // column. This union takes the role of an enhanced enum that allows rich 826 // elements (which will be needed for a collation-based ordering in the future). 827 // 828 // Possible values are: 829 // 830 // TypeDefinedOrder - the column uses the order defined by its logical or 831 // physical type (if there is no logical type). 832 // 833 // If the reader does not support the value of this union, min and max stats 834 // for this column should be ignored. 
// ColumnOrder is a union describing the sort order that applies to the
// min/max values recorded for a column.
type ColumnOrder struct { // union
	// The sort orders for logical types are:
	//   UTF8 - unsigned byte-wise comparison
	//   INT8 - signed comparison
	//   INT16 - signed comparison
	//   INT32 - signed comparison
	//   INT64 - signed comparison
	//   UINT8 - unsigned comparison
	//   UINT16 - unsigned comparison
	//   UINT32 - unsigned comparison
	//   UINT64 - unsigned comparison
	//   DECIMAL - signed comparison of the represented value
	//   DATE - signed comparison
	//   TIME_MILLIS - signed comparison
	//   TIME_MICROS - signed comparison
	//   TIMESTAMP_MILLIS - signed comparison
	//   TIMESTAMP_MICROS - signed comparison
	//   INTERVAL - unsigned comparison
	//   JSON - unsigned byte-wise comparison
	//   BSON - unsigned byte-wise comparison
	//   ENUM - unsigned byte-wise comparison
	//   LIST - undefined
	//   MAP - undefined
	//
	// In the absence of logical types, the sort order is determined by the physical type:
	//   BOOLEAN - false, true
	//   INT32 - signed comparison
	//   INT64 - signed comparison
	//   INT96 (only used for legacy timestamps) - undefined
	//   FLOAT - signed comparison of the represented value (*)
	//   DOUBLE - signed comparison of the represented value (*)
	//   BYTE_ARRAY - unsigned byte-wise comparison
	//   FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
	//
	// (*) Because the sorting order is not specified properly for floating
	//     point values (relations vs. total ordering) the following
	//     compatibility rules should be applied when reading statistics:
	//     - If the min is a NaN, it should be ignored.
	//     - If the max is a NaN, it should be ignored.
	//     - If the min is +0, the row group may contain -0 values as well.
	//     - If the max is -0, the row group may contain +0 values as well.
	//     - When looking for NaN values, min and max should be ignored.
	TypeOrder *TypeDefinedOrder `thrift:"1"`
}

// PageLocation records where a single page lives: its offset in the file, its
// size on disk, and the index of its first row within the row group.
type PageLocation struct {
	// Offset of the page in the file.
	Offset int64 `thrift:"1,required"`

	// Size of the page, including header. Sum of compressed_page_size and
	// header length.
	CompressedPageSize int32 `thrift:"2,required"`

	// Index within the RowGroup of the first row of the page; this means
	// pages change on record boundaries (r = 0).
	FirstRowIndex int64 `thrift:"3,required"`
}

// OffsetIndex holds the list of page locations. The per-page lists in
// ColumnIndex are matched against PageLocations by position.
type OffsetIndex struct {
	// PageLocations, ordered by increasing PageLocation.offset. It is required
	// that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
	PageLocations []PageLocation `thrift:"1,required"`
}

// ColumnIndex holds per-page lists: null-page flags, lower and upper value
// bounds, and null counts.
// Each <array-field>[i] refers to the page at OffsetIndex.PageLocations[i].
type ColumnIndex struct {
	// A list of Boolean values to determine the validity of the corresponding
	// min and max values. If true, a page contains only null values, and writers
	// have to set the corresponding entries in min_values and max_values to
	// byte[0], so that all lists have the same length. If false, the
	// corresponding entries in min_values and max_values must be valid.
	NullPages []bool `thrift:"1,required"`

	// Two lists containing lower and upper bounds for the values of each page
	// determined by the ColumnOrder of the column. These may be the actual
	// minimum and maximum values found on a page, but can also be (more compact)
	// values that do not exist on a page. For example, instead of storing "Blart
	// Versenwald III", a writer may set min_values[i]="B", max_values[i]="C".
	// Such more compact values must still be valid values within the column's
	// logical type. Readers must make sure that list entries are populated before
	// using them by inspecting null_pages.
	MinValues [][]byte `thrift:"2,required"`
	MaxValues [][]byte `thrift:"3,required"`

	// Stores whether both min_values and max_values are ordered and if so, in
	// which direction. This allows readers to perform binary searches in both
	// lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
	// if the lists are ordered.
	BoundaryOrder BoundaryOrder `thrift:"4,required"`

	// A list containing the number of null values for each page.
	NullCounts []int64 `thrift:"5,optional"`
}

// AesGcmV1 is the first member of the EncryptionAlgorithm union. All of its
// fields are optional.
type AesGcmV1 struct {
	// AAD prefix.
	AadPrefix []byte `thrift:"1,optional"`

	// Unique file identifier part of AAD suffix.
	AadFileUnique []byte `thrift:"2,optional"`

	// In files encrypted with AAD prefix without storing it,
	// readers must supply the prefix.
	SupplyAadPrefix bool `thrift:"3,optional"`
}

// AesGcmCtrV1 is the second member of the EncryptionAlgorithm union. It
// declares the same three optional fields as AesGcmV1.
type AesGcmCtrV1 struct {
	// AAD prefix.
	AadPrefix []byte `thrift:"1,optional"`

	// Unique file identifier part of AAD suffix.
	AadFileUnique []byte `thrift:"2,optional"`

	// In files encrypted with AAD prefix without storing it,
	// readers must supply the prefix.
	SupplyAadPrefix bool `thrift:"3,optional"`
}

// EncryptionAlgorithm is a union of the AesGcmV1 and AesGcmCtrV1 descriptors.
// NOTE(review): as a union, presumably at most one member is non-nil — confirm
// against the thrift codec.
type EncryptionAlgorithm struct { // union
	AesGcmV1    *AesGcmV1    `thrift:"1"`
	AesGcmCtrV1 *AesGcmCtrV1 `thrift:"2"`
}

// FileMetaData is the description for file metadata: the file version, its
// schema, row count and row groups, plus optional key/value metadata, writer
// identification, column sort orders and encryption details.
type FileMetaData struct {
	// Version of this file.
	Version int32 `thrift:"1,required"`

	// Parquet schema for this file. This schema contains metadata for all the columns.
	// The schema is represented as a tree with a single root. The nodes of the tree
	// are flattened to a list by doing a depth-first traversal.
	// The column metadata contains the path in the schema for that column which can be
	// used to map columns to nodes in the schema.
	// The first element is the root.
	Schema []SchemaElement `thrift:"2,required"`

	// Number of rows in this file.
	NumRows int64 `thrift:"3,required"`

	// Row groups in this file.
	RowGroups []RowGroup `thrift:"4,required"`

	// Optional key/value metadata.
	KeyValueMetadata []KeyValue `thrift:"5,optional"`

	// String for application that wrote this file. This should be in the format
	// <Application> version <App Version> (build <App Build Hash>).
	// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
	CreatedBy string `thrift:"6,optional"`

	// Sort order used for the min_value and max_value fields in the Statistics
	// objects and the min_values and max_values fields in the ColumnIndex
	// objects of each column in this file. Sort orders are listed in the order
	// matching the columns in the schema. The indexes are not necessarily the
	// same though, because only leaf nodes of the schema are represented in the
	// list of sort orders.
	//
	// Without column_orders, the meaning of the min_value and max_value fields
	// in the Statistics object and the ColumnIndex object is undefined. To ensure
	// well-defined behavior, if these fields are written to a Parquet file,
	// column_orders must be written as well.
	//
	// The obsolete min and max fields in the Statistics object are always sorted
	// by signed comparison regardless of column_orders.
	ColumnOrders []ColumnOrder `thrift:"7,optional"`

	// Encryption algorithm. This field is set only in encrypted files
	// with plaintext footer. Files with encrypted footer store algorithm id
	// in FileCryptoMetaData structure.
	EncryptionAlgorithm EncryptionAlgorithm `thrift:"8,optional"`

	// Retrieval metadata of key used for signing the footer.
	// Used only in encrypted files with plaintext footer.
	FooterSigningKeyMetadata []byte `thrift:"9,optional"`
}

// FileCryptoMetaData is the crypto metadata for files with encrypted footer.
type FileCryptoMetaData struct {
	// Encryption algorithm. This field is only used for files
	// with encrypted footer. Files with plaintext footer store algorithm id
	// inside footer (FileMetaData structure).
	EncryptionAlgorithm EncryptionAlgorithm `thrift:"1,required"`

	// Retrieval metadata of key used for encryption of footer,
	// and (possibly) columns.
	KeyMetadata []byte `thrift:"2,optional"`
}