package parquet

import (
	"encoding/binary"
	"fmt"
	"io"
	"reflect"

	"github.com/parquet-go/parquet-go/compress"
	"github.com/parquet-go/parquet-go/deprecated"
	"github.com/parquet-go/parquet-go/encoding"
	"github.com/parquet-go/parquet-go/format"
	"github.com/parquet-go/parquet-go/internal/unsafecast"
)

// Column represents a column in a parquet file.
//
// Methods of Column values are safe to call concurrently from multiple
// goroutines.
//
// Column instances satisfy the Node interface.
type Column struct {
	typ    Type
	file   *File
	schema *format.SchemaElement
	order  *format.ColumnOrder
	path   columnPath
	// columns holds the child columns; it is empty for leaf columns.
	columns []*Column
	// chunks, columnIndex, and offsetIndex are only populated for leaf
	// columns; each holds one entry per row group of the file.
	chunks      []*format.ColumnChunk
	columnIndex []*format.ColumnIndex
	offsetIndex []*format.OffsetIndex
	encoding    encoding.Encoding
	compression compress.Codec

	depth              int8
	maxRepetitionLevel byte
	maxDefinitionLevel byte
	// index is the position of the leaf column in a row; -1 for non-leaf
	// (group) columns.
	index int16
}

// Type returns the type of the column.
//
// The returned value is unspecified if c is not a leaf column.
func (c *Column) Type() Type { return c.typ }

// Optional returns true if the column is optional.
func (c *Column) Optional() bool { return schemaRepetitionTypeOf(c.schema) == format.Optional }

// Repeated returns true if the column may repeat.
func (c *Column) Repeated() bool { return schemaRepetitionTypeOf(c.schema) == format.Repeated }

// Required returns true if the column is required.
func (c *Column) Required() bool { return schemaRepetitionTypeOf(c.schema) == format.Required }

// Leaf returns true if c is a leaf column.
func (c *Column) Leaf() bool { return c.index >= 0 }

// Fields returns the list of fields on the column.
func (c *Column) Fields() []Field {
	fields := make([]Field, len(c.columns))
	for i, column := range c.columns {
		fields[i] = column
	}
	return fields
}

// Encoding returns the encodings used by this column.
func (c *Column) Encoding() encoding.Encoding { return c.encoding }

// Compression returns the compression codecs used by this column.
func (c *Column) Compression() compress.Codec { return c.compression }

// Path of the column in the parquet schema.
//
// The leading path element (the root of the schema) is omitted.
func (c *Column) Path() []string { return c.path[1:] }

// Name returns the column name.
func (c *Column) Name() string { return c.schema.Name }

// ID returns column field id
func (c *Column) ID() int { return int(c.schema.FieldID) }

// Columns returns the list of child columns.
//
// The method returns the same slice across multiple calls, the program must
// treat it as a read-only value.
func (c *Column) Columns() []*Column { return c.columns }

// Column returns the child column matching the given name.
func (c *Column) Column(name string) *Column {
	for _, child := range c.columns {
		if child.Name() == name {
			return child
		}
	}
	return nil
}
99 func (c *Column) Pages() Pages { 100 if c.index < 0 { 101 return emptyPages{} 102 } 103 r := &columnPages{ 104 pages: make([]filePages, len(c.file.rowGroups)), 105 } 106 for i := range r.pages { 107 r.pages[i].init(c.file.rowGroups[i].(*fileRowGroup).columns[c.index].(*fileColumnChunk)) 108 } 109 return r 110 } 111 112 type columnPages struct { 113 pages []filePages 114 index int 115 } 116 117 func (c *columnPages) ReadPage() (Page, error) { 118 for { 119 if c.index >= len(c.pages) { 120 return nil, io.EOF 121 } 122 p, err := c.pages[c.index].ReadPage() 123 if err == nil || err != io.EOF { 124 return p, err 125 } 126 c.index++ 127 } 128 } 129 130 func (c *columnPages) SeekToRow(rowIndex int64) error { 131 c.index = 0 132 133 for c.index < len(c.pages) && c.pages[c.index].chunk.rowGroup.NumRows < rowIndex { 134 rowIndex -= c.pages[c.index].chunk.rowGroup.NumRows 135 c.index++ 136 } 137 138 if c.index < len(c.pages) { 139 if err := c.pages[c.index].SeekToRow(rowIndex); err != nil { 140 return err 141 } 142 for i := c.index + 1; i < len(c.pages); i++ { 143 p := &c.pages[i] 144 if err := p.SeekToRow(0); err != nil { 145 return err 146 } 147 } 148 } 149 return nil 150 } 151 152 func (c *columnPages) Close() error { 153 var lastErr error 154 155 for i := range c.pages { 156 if err := c.pages[i].Close(); err != nil { 157 lastErr = err 158 } 159 } 160 161 c.pages = nil 162 c.index = 0 163 return lastErr 164 } 165 166 // Depth returns the position of the column relative to the root. 167 func (c *Column) Depth() int { return int(c.depth) } 168 169 // MaxRepetitionLevel returns the maximum value of repetition levels on this 170 // column. 171 func (c *Column) MaxRepetitionLevel() int { return int(c.maxRepetitionLevel) } 172 173 // MaxDefinitionLevel returns the maximum value of definition levels on this 174 // column. 175 func (c *Column) MaxDefinitionLevel() int { return int(c.maxDefinitionLevel) } 176 177 // Index returns the position of the column in a row. 
// Index returns the position of the column in a row. Only leaf columns have a
// column index, the method returns -1 when called on non-leaf columns.
func (c *Column) Index() int { return int(c.index) }

// GoType returns the Go type that best represents the parquet column.
func (c *Column) GoType() reflect.Type { return goTypeOf(c) }

// Value returns the sub-value in base for the child column at the given
// index.
func (c *Column) Value(base reflect.Value) reflect.Value {
	return base.MapIndex(reflect.ValueOf(&c.schema.Name).Elem())
}

// String returns a human-readable string representation of the column.
func (c *Column) String() string { return c.path.String() + ": " + sprint(c.Name(), c) }

// forEachLeaf invokes do for every leaf column of the sub-tree rooted at c,
// in depth-first order.
func (c *Column) forEachLeaf(do func(*Column)) {
	if len(c.columns) == 0 {
		do(c)
	} else {
		for _, child := range c.columns {
			child.forEachLeaf(do)
		}
	}
}

// openColumns constructs the column tree of file from its schema metadata,
// then assigns depth, repetition/definition levels and leaf indexes by
// calling setLevels on the root.
func openColumns(file *File) (*Column, error) {
	cl := columnLoader{}

	c, err := cl.open(file, nil)
	if err != nil {
		return nil, err
	}

	// Validate that there aren't extra entries in the row group columns,
	// which would otherwise indicate that there are dangling data pages
	// in the file.
	for index, rowGroup := range file.metadata.RowGroups {
		if cl.rowGroupColumnIndex != len(rowGroup.Columns) {
			return nil, fmt.Errorf("row group at index %d contains %d columns but %d were referenced by the column schemas",
				index, len(rowGroup.Columns), cl.rowGroupColumnIndex)
		}
	}

	_, err = c.setLevels(0, 0, 0, 0)
	return c, err
}

// setLevels recursively assigns each column its depth, maximum repetition and
// definition levels, and — for leaf columns — its column index. It returns
// the next available leaf index, or an error when one of the format limits
// (depth, column count, repetition or definition levels) is exceeded.
func (c *Column) setLevels(depth, repetition, definition, index int) (int, error) {
	if depth > MaxColumnDepth {
		return -1, fmt.Errorf("cannot represent parquet columns with more than %d nested levels: %s", MaxColumnDepth, c.path)
	}
	if index > MaxColumnIndex {
		return -1, fmt.Errorf("cannot represent parquet rows with more than %d columns: %s", MaxColumnIndex, c.path)
	}
	if repetition > MaxRepetitionLevel {
		return -1, fmt.Errorf("cannot represent parquet columns with more than %d repetition levels: %s", MaxRepetitionLevel, c.path)
	}
	if definition > MaxDefinitionLevel {
		return -1, fmt.Errorf("cannot represent parquet columns with more than %d definition levels: %s", MaxDefinitionLevel, c.path)
	}

	// Optional columns add a definition level; repeated columns add both a
	// repetition and a definition level. Required columns add neither.
	switch schemaRepetitionTypeOf(c.schema) {
	case format.Optional:
		definition++
	case format.Repeated:
		repetition++
		definition++
	}

	c.depth = int8(depth)
	c.maxRepetitionLevel = byte(repetition)
	c.maxDefinitionLevel = byte(definition)
	depth++

	// Only leaf columns receive a column index; groups are marked with -1.
	if len(c.columns) > 0 {
		c.index = -1
	} else {
		c.index = int16(index)
		index++
	}

	var err error
	for _, child := range c.columns {
		if index, err = child.setLevels(depth, repetition, definition, index); err != nil {
			return -1, err
		}
	}
	return index, nil
}

// columnLoader carries the cursors used while walking the flattened schema
// element list of the file metadata.
type columnLoader struct {
	schemaIndex         int // next element of file.metadata.Schema to consume
	columnOrderIndex    int // next entry of file.metadata.ColumnOrders to consume
	rowGroupColumnIndex int // column position consumed from each row group so far
}

// open builds the Column rooted at the schema element currently pointed to by
// cl.schemaIndex, recursing into its children. For leaf columns it also wires
// up the per-row-group column chunks, column indexes and offset indexes, and
// records the encoding and compression of the first chunk.
func (cl *columnLoader) open(file *File, path []string) (*Column, error) {
	c := &Column{
		file:   file,
		schema: &file.metadata.Schema[cl.schemaIndex],
	}
	c.path = columnPath(path).append(c.schema.Name)

	cl.schemaIndex++
	numChildren := int(c.schema.NumChildren)

	if numChildren == 0 {
		c.typ = schemaElementTypeOf(c.schema)

		if cl.columnOrderIndex < len(file.metadata.ColumnOrders) {
			c.order = &file.metadata.ColumnOrders[cl.columnOrderIndex]
			cl.columnOrderIndex++
		}

		rowGroups := file.metadata.RowGroups
		rowGroupColumnIndex := cl.rowGroupColumnIndex
		cl.rowGroupColumnIndex++

		c.chunks = make([]*format.ColumnChunk, 0, len(rowGroups))
		c.columnIndex = make([]*format.ColumnIndex, 0, len(rowGroups))
		c.offsetIndex = make([]*format.OffsetIndex, 0, len(rowGroups))

		for i, rowGroup := range rowGroups {
			if rowGroupColumnIndex >= len(rowGroup.Columns) {
				return nil, fmt.Errorf("row group at index %d does not have enough columns", i)
			}
			c.chunks = append(c.chunks, &rowGroup.Columns[rowGroupColumnIndex])
		}

		if len(file.columnIndexes) > 0 {
			for i := range rowGroups {
				if rowGroupColumnIndex >= len(file.columnIndexes) {
					return nil, fmt.Errorf("row group at index %d does not have enough column index pages", i)
				}
				// NOTE(review): the same file.columnIndexes entry is appended
				// once per row group; if file.columnIndexes is laid out as
				// [rowGroup][column] the index should presumably account for
				// i as well — verify against how file.columnIndexes is
				// populated.
				c.columnIndex = append(c.columnIndex, &file.columnIndexes[rowGroupColumnIndex])
			}
		}

		if len(file.offsetIndexes) > 0 {
			for i := range rowGroups {
				if rowGroupColumnIndex >= len(file.offsetIndexes) {
					return nil, fmt.Errorf("row group at index %d does not have enough offset index pages", i)
				}
				// NOTE(review): same concern as for columnIndex above.
				c.offsetIndex = append(c.offsetIndex, &file.offsetIndexes[rowGroupColumnIndex])
			}
		}

		if len(c.chunks) > 0 {
			// Pick the encoding and compression codec of the first chunk.
			//
			// Technically each column chunk may use a different compression
			// codec, and each page of the column chunk might have a different
			// encoding. Exposing these details does not provide a lot of value
			// to the end user.
			//
			// Programs that wish to determine the encoding and compression of
			// each page of the column should iterate through the pages and read
			// the page headers to determine which compression and encodings are
			// applied.
			for _, encoding := range c.chunks[0].MetaData.Encoding {
				if c.encoding == nil {
					c.encoding = LookupEncoding(encoding)
				}
				// Prefer the first data encoding over PLAIN/RLE, which may
				// only describe the dictionary or level encodings.
				if encoding != format.Plain && encoding != format.RLE {
					c.encoding = LookupEncoding(encoding)
					break
				}
			}
			c.compression = LookupCompressionCodec(c.chunks[0].MetaData.Codec)
		}

		return c, nil
	}

	// Group (non-leaf) column: recurse into the children.
	c.typ = &groupType{}
	if lt := c.schema.LogicalType; lt != nil && lt.Map != nil {
		c.typ = &mapType{}
	}
	c.columns = make([]*Column, numChildren)

	for i := range c.columns {
		if cl.schemaIndex >= len(file.metadata.Schema) {
			return nil, fmt.Errorf("column %q has more children than there are schemas in the file: %d > %d",
				c.schema.Name, cl.schemaIndex+1, len(file.metadata.Schema))
		}

		var err error
		c.columns[i], err = cl.open(file, c.path)
		if err != nil {
			return nil, fmt.Errorf("%s: %w", c.schema.Name, err)
		}
	}

	return c, nil
}

// schemaElementTypeOf maps a parquet schema element to the Type
// implementation of this package, preferring the logical type, then the
// legacy converted type, then the raw physical type.
func schemaElementTypeOf(s *format.SchemaElement) Type {
	if lt := s.LogicalType; lt != nil {
		// A logical type exists, the Type interface implementations in this
		// package are all based on the logical parquet types declared in the
		// format sub-package so we can return them directly via a pointer type
		// conversion.
		switch {
		case lt.UTF8 != nil:
			return (*stringType)(lt.UTF8)
		case lt.Map != nil:
			return (*mapType)(lt.Map)
		case lt.List != nil:
			return (*listType)(lt.List)
		case lt.Enum != nil:
			return (*enumType)(lt.Enum)
		case lt.Decimal != nil:
			// A parquet decimal can be one of several different physical types.
			if t := s.Type; t != nil {
				var typ Type
				switch kind := Kind(*s.Type); kind {
				case Int32:
					typ = Int32Type
				case Int64:
					typ = Int64Type
				case FixedLenByteArray:
					if s.TypeLength == nil {
						panic("DECIMAL using FIXED_LEN_BYTE_ARRAY must specify a length")
					}
					typ = FixedLenByteArrayType(int(*s.TypeLength))
				default:
					panic("DECIMAL must be of type INT32, INT64, or FIXED_LEN_BYTE_ARRAY but got " + kind.String())
				}
				return &decimalType{
					decimal: *lt.Decimal,
					Type:    typ,
				}
			}
		case lt.Date != nil:
			return (*dateType)(lt.Date)
		case lt.Time != nil:
			return (*timeType)(lt.Time)
		case lt.Timestamp != nil:
			return (*timestampType)(lt.Timestamp)
		case lt.Integer != nil:
			return (*intType)(lt.Integer)
		case lt.Unknown != nil:
			return (*nullType)(lt.Unknown)
		case lt.Json != nil:
			return (*jsonType)(lt.Json)
		case lt.Bson != nil:
			return (*bsonType)(lt.Bson)
		case lt.UUID != nil:
			return (*uuidType)(lt.UUID)
		}
	}

	if ct := s.ConvertedType; ct != nil {
		// This column contains no logical type but has a converted type, it
		// was likely created by an older parquet writer. Convert the legacy
		// type representation to the equivalent logical parquet type.
		switch *ct {
		case deprecated.UTF8:
			return &stringType{}
		case deprecated.Map:
			return &mapType{}
		case deprecated.MapKeyValue:
			return &groupType{}
		case deprecated.List:
			return &listType{}
		case deprecated.Enum:
			return &enumType{}
		case deprecated.Decimal:
			if s.Scale != nil && s.Precision != nil {
				// A parquet decimal can be one of several different physical types.
				if t := s.Type; t != nil {
					var typ Type
					switch kind := Kind(*s.Type); kind {
					case Int32:
						typ = Int32Type
					case Int64:
						typ = Int64Type
					case FixedLenByteArray:
						if s.TypeLength == nil {
							panic("DECIMAL using FIXED_LEN_BYTE_ARRAY must specify a length")
						}
						typ = FixedLenByteArrayType(int(*s.TypeLength))
					case ByteArray:
						typ = ByteArrayType
					default:
						panic("DECIMAL must be of type INT32, INT64, BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY but got " + kind.String())
					}
					return &decimalType{
						decimal: format.DecimalType{
							Scale:     *s.Scale,
							Precision: *s.Precision,
						},
						Type: typ,
					}
				}
			}
		case deprecated.Date:
			return &dateType{}
		case deprecated.TimeMillis:
			return &timeType{IsAdjustedToUTC: true, Unit: Millisecond.TimeUnit()}
		case deprecated.TimeMicros:
			return &timeType{IsAdjustedToUTC: true, Unit: Microsecond.TimeUnit()}
		case deprecated.TimestampMillis:
			return &timestampType{IsAdjustedToUTC: true, Unit: Millisecond.TimeUnit()}
		case deprecated.TimestampMicros:
			return &timestampType{IsAdjustedToUTC: true, Unit: Microsecond.TimeUnit()}
		case deprecated.Uint8:
			return &unsignedIntTypes[0]
		case deprecated.Uint16:
			return &unsignedIntTypes[1]
		case deprecated.Uint32:
			return &unsignedIntTypes[2]
		case deprecated.Uint64:
			return &unsignedIntTypes[3]
		case deprecated.Int8:
			return &signedIntTypes[0]
		case deprecated.Int16:
			return &signedIntTypes[1]
		case deprecated.Int32:
			return &signedIntTypes[2]
		case deprecated.Int64:
			return &signedIntTypes[3]
		case deprecated.Json:
			return &jsonType{}
		case deprecated.Bson:
			return &bsonType{}
		case deprecated.Interval:
			// TODO
		}
	}

	if t := s.Type; t != nil {
		// The column only has a physical type, convert it to one of the
		// primitive types supported by this package.
		switch kind := Kind(*t); kind {
		case Boolean:
			return BooleanType
		case Int32:
			return Int32Type
		case Int64:
			return Int64Type
		case Int96:
			return Int96Type
		case Float:
			return FloatType
		case Double:
			return DoubleType
		case ByteArray:
			return ByteArrayType
		case FixedLenByteArray:
			if s.TypeLength != nil {
				return FixedLenByteArrayType(int(*s.TypeLength))
			}
		}
	}

	// If we reach this point, we are likely reading a parquet column that was
	// written with a non-standard type or is in a newer version of the format
	// than this package supports.
	return &nullType{}
}

// schemaRepetitionTypeOf returns the repetition type of a schema element,
// defaulting to Required when the element does not declare one.
func schemaRepetitionTypeOf(s *format.SchemaElement) format.FieldRepetitionType {
	if s.RepetitionType != nil {
		return *s.RepetitionType
	}
	return format.Required
}

// decompress decodes compressedPageData into a buffer obtained from the
// shared buffer pool. On error the buffer is released and a nil page is
// returned.
func (c *Column) decompress(compressedPageData []byte, uncompressedPageSize int32) (page *buffer, err error) {
	page = buffers.get(int(uncompressedPageSize))
	page.data, err = c.compression.Decode(page.data, compressedPageData)
	if err != nil {
		page.unref()
		page = nil
	}
	return page, err
}
// DecodeDataPageV1 decodes a data page from the header, compressed data, and
// optional dictionary passed as arguments.
func (c *Column) DecodeDataPageV1(header DataPageHeaderV1, page []byte, dict Dictionary) (Page, error) {
	return c.decodeDataPageV1(header, &buffer{data: page}, dict, -1)
}

// decodeDataPageV1 decompresses the page (when the column is compressed),
// decodes the repetition and definition levels embedded at the front of the
// page data, and hands the remaining value bytes to decodeDataPage.
func (c *Column) decodeDataPageV1(header DataPageHeaderV1, page *buffer, dict Dictionary, size int32) (Page, error) {
	var pageData = page.data
	var err error

	if isCompressed(c.compression) {
		if page, err = c.decompress(pageData, size); err != nil {
			return nil, fmt.Errorf("decompressing data page v1: %w", err)
		}
		// Hold the decompressed buffer until decodeDataPage has consumed it.
		defer page.unref()
		pageData = page.data
	}

	var numValues = int(header.NumValues())
	var repetitionLevels *buffer
	var definitionLevels *buffer

	if c.maxRepetitionLevel > 0 {
		encoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel)
		repetitionLevels, pageData, err = decodeLevelsV1(encoding, numValues, pageData)
		if err != nil {
			return nil, fmt.Errorf("decoding repetition levels of data page v1: %w", err)
		}
		defer repetitionLevels.unref()
	}

	if c.maxDefinitionLevel > 0 {
		encoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel)
		definitionLevels, pageData, err = decodeLevelsV1(encoding, numValues, pageData)
		if err != nil {
			return nil, fmt.Errorf("decoding definition levels of data page v1: %w", err)
		}
		defer definitionLevels.unref()

		// Data pages v1 did not embed the number of null values,
		// so we have to compute it from the definition levels.
		numValues -= countLevelsNotEqual(definitionLevels.data, c.maxDefinitionLevel)
	}

	return c.decodeDataPage(header, numValues, repetitionLevels, definitionLevels, page, pageData, dict)
}
606 func (c *Column) DecodeDataPageV2(header DataPageHeaderV2, page []byte, dict Dictionary) (Page, error) { 607 return c.decodeDataPageV2(header, &buffer{data: page}, dict, -1) 608 } 609 610 func (c *Column) decodeDataPageV2(header DataPageHeaderV2, page *buffer, dict Dictionary, size int32) (Page, error) { 611 var numValues = int(header.NumValues()) 612 var pageData = page.data 613 var err error 614 var repetitionLevels *buffer 615 var definitionLevels *buffer 616 617 if length := header.RepetitionLevelsByteLength(); length > 0 { 618 if c.maxRepetitionLevel == 0 { 619 // In some cases we've observed files which have a non-zero 620 // repetition level despite the column not being repeated 621 // (nor nested within a repeated column). 622 // 623 // See https://github.com/apache/parquet-testing/pull/24 624 pageData, err = skipLevelsV2(pageData, length) 625 } else { 626 encoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel) 627 repetitionLevels, pageData, err = decodeLevelsV2(encoding, numValues, pageData, length) 628 } 629 if err != nil { 630 return nil, fmt.Errorf("decoding repetition levels of data page v2: %w", io.ErrUnexpectedEOF) 631 } 632 if repetitionLevels != nil { 633 defer repetitionLevels.unref() 634 } 635 } 636 637 if length := header.DefinitionLevelsByteLength(); length > 0 { 638 if c.maxDefinitionLevel == 0 { 639 pageData, err = skipLevelsV2(pageData, length) 640 } else { 641 encoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel) 642 definitionLevels, pageData, err = decodeLevelsV2(encoding, numValues, pageData, length) 643 } 644 if err != nil { 645 return nil, fmt.Errorf("decoding definition levels of data page v2: %w", io.ErrUnexpectedEOF) 646 } 647 if definitionLevels != nil { 648 defer definitionLevels.unref() 649 } 650 } 651 652 if isCompressed(c.compression) && header.IsCompressed() { 653 if page, err = c.decompress(pageData, size); err != nil { 654 return nil, 
fmt.Errorf("decompressing data page v2: %w", err) 655 } 656 defer page.unref() 657 pageData = page.data 658 } 659 660 numValues -= int(header.NumNulls()) 661 return c.decodeDataPage(header, numValues, repetitionLevels, definitionLevels, page, pageData, dict) 662 } 663 664 func (c *Column) decodeDataPage(header DataPageHeader, numValues int, repetitionLevels, definitionLevels, page *buffer, data []byte, dict Dictionary) (Page, error) { 665 pageEncoding := LookupEncoding(header.Encoding()) 666 pageType := c.Type() 667 668 if isDictionaryEncoding(pageEncoding) { 669 // In some legacy configurations, the PLAIN_DICTIONARY encoding is used 670 // on data page headers to indicate that the page contains indexes into 671 // the dictionary page, but the page is still encoded using the RLE 672 // encoding in this case, so we convert it to RLE_DICTIONARY. 673 pageEncoding = &RLEDictionary 674 pageType = indexedPageType{newIndexedType(pageType, dict)} 675 } 676 677 var vbuf, obuf *buffer 678 var pageValues []byte 679 var pageOffsets []uint32 680 681 if pageEncoding.CanDecodeInPlace() { 682 vbuf = page 683 pageValues = data 684 } else { 685 vbuf = buffers.get(pageType.EstimateDecodeSize(numValues, data, pageEncoding)) 686 defer vbuf.unref() 687 pageValues = vbuf.data 688 } 689 690 // Page offsets not needed when dictionary-encoded 691 if pageType.Kind() == ByteArray && !isDictionaryEncoding(pageEncoding) { 692 obuf = buffers.get(4 * (numValues + 1)) 693 defer obuf.unref() 694 pageOffsets = unsafecast.BytesToUint32(obuf.data) 695 } 696 697 values := pageType.NewValues(pageValues, pageOffsets) 698 values, err := pageType.Decode(values, data, pageEncoding) 699 if err != nil { 700 return nil, err 701 } 702 703 newPage := pageType.NewPage(c.Index(), numValues, values) 704 switch { 705 case c.maxRepetitionLevel > 0: 706 newPage = newRepeatedPage( 707 newPage, 708 c.maxRepetitionLevel, 709 c.maxDefinitionLevel, 710 repetitionLevels.data, 711 definitionLevels.data, 712 ) 713 case 
c.maxDefinitionLevel > 0: 714 newPage = newOptionalPage( 715 newPage, 716 c.maxDefinitionLevel, 717 definitionLevels.data, 718 ) 719 } 720 721 return newBufferedPage(newPage, vbuf, obuf, repetitionLevels, definitionLevels), nil 722 } 723 724 func decodeLevelsV1(enc encoding.Encoding, numValues int, data []byte) (*buffer, []byte, error) { 725 if len(data) < 4 { 726 return nil, data, io.ErrUnexpectedEOF 727 } 728 i := 4 729 j := 4 + int(binary.LittleEndian.Uint32(data)) 730 if j > len(data) { 731 return nil, data, io.ErrUnexpectedEOF 732 } 733 levels, err := decodeLevels(enc, numValues, data[i:j]) 734 return levels, data[j:], err 735 } 736 737 func decodeLevelsV2(enc encoding.Encoding, numValues int, data []byte, length int64) (*buffer, []byte, error) { 738 levels, err := decodeLevels(enc, numValues, data[:length]) 739 return levels, data[length:], err 740 } 741 742 func decodeLevels(enc encoding.Encoding, numValues int, data []byte) (levels *buffer, err error) { 743 levels = buffers.get(numValues) 744 levels.data, err = enc.DecodeLevels(levels.data, data) 745 if err != nil { 746 levels.unref() 747 levels = nil 748 } else { 749 switch { 750 case len(levels.data) < numValues: 751 err = fmt.Errorf("decoding level expected %d values but got only %d", numValues, len(levels.data)) 752 case len(levels.data) > numValues: 753 levels.data = levels.data[:numValues] 754 } 755 } 756 return levels, err 757 } 758 759 func skipLevelsV2(data []byte, length int64) ([]byte, error) { 760 if length >= int64(len(data)) { 761 return data, io.ErrUnexpectedEOF 762 } 763 return data[length:], nil 764 } 765 766 // DecodeDictionary decodes a data page from the header and compressed data 767 // passed as arguments. 
768 func (c *Column) DecodeDictionary(header DictionaryPageHeader, page []byte) (Dictionary, error) { 769 return c.decodeDictionary(header, &buffer{data: page}, -1) 770 } 771 772 func (c *Column) decodeDictionary(header DictionaryPageHeader, page *buffer, size int32) (Dictionary, error) { 773 pageData := page.data 774 775 if isCompressed(c.compression) { 776 var err error 777 if page, err = c.decompress(pageData, size); err != nil { 778 return nil, fmt.Errorf("decompressing dictionary page: %w", err) 779 } 780 defer page.unref() 781 pageData = page.data 782 } 783 784 pageType := c.Type() 785 pageEncoding := header.Encoding() 786 if pageEncoding == format.PlainDictionary { 787 pageEncoding = format.Plain 788 } 789 790 // Dictionaries always have PLAIN encoding, so we need to allocate offsets for the decoded page. 791 numValues := int(header.NumValues()) 792 dictBufferSize := pageType.EstimateDecodeSize(numValues, pageData, LookupEncoding(pageEncoding)) 793 values := pageType.NewValues(make([]byte, 0, dictBufferSize), make([]uint32, 0, numValues)) 794 values, err := pageType.Decode(values, pageData, LookupEncoding(pageEncoding)) 795 if err != nil { 796 return nil, err 797 } 798 return pageType.NewDictionary(int(c.index), numValues, values), nil 799 } 800 801 var ( 802 _ Node = (*Column)(nil) 803 )