github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/column.go (about) 1 package parquet 2 3 import ( 4 "encoding/binary" 5 "fmt" 6 "io" 7 "reflect" 8 9 "github.com/segmentio/parquet-go/compress" 10 "github.com/segmentio/parquet-go/deprecated" 11 "github.com/segmentio/parquet-go/encoding" 12 "github.com/segmentio/parquet-go/format" 13 "github.com/segmentio/parquet-go/internal/unsafecast" 14 ) 15 16 // Column represents a column in a parquet file. 17 // 18 // Methods of Column values are safe to call concurrently from multiple 19 // goroutines. 20 // 21 // Column instances satisfy the Node interface. 22 type Column struct { 23 typ Type 24 file *File 25 schema *format.SchemaElement 26 order *format.ColumnOrder 27 path columnPath 28 columns []*Column 29 chunks []*format.ColumnChunk 30 columnIndex []*format.ColumnIndex 31 offsetIndex []*format.OffsetIndex 32 encoding encoding.Encoding 33 compression compress.Codec 34 35 depth int8 36 maxRepetitionLevel byte 37 maxDefinitionLevel byte 38 index int16 39 } 40 41 // Type returns the type of the column. 42 // 43 // The returned value is unspecified if c is not a leaf column. 44 func (c *Column) Type() Type { return c.typ } 45 46 // Optional returns true if the column is optional. 47 func (c *Column) Optional() bool { return schemaRepetitionTypeOf(c.schema) == format.Optional } 48 49 // Repeated returns true if the column may repeat. 50 func (c *Column) Repeated() bool { return schemaRepetitionTypeOf(c.schema) == format.Repeated } 51 52 // Required returns true if the column is required. 53 func (c *Column) Required() bool { return schemaRepetitionTypeOf(c.schema) == format.Required } 54 55 // Leaf returns true if c is a leaf column. 56 func (c *Column) Leaf() bool { return c.index >= 0 } 57 58 // Fields returns the list of fields on the column. 59 func (c *Column) Fields() []Field { 60 fields := make([]Field, len(c.columns)) 61 for i, column := range c.columns { 62 fields[i] = column 63 } 64 return fields 65 } 66 67 // Encoding returns the encodings used by this column. 68 func (c *Column) Encoding() encoding.Encoding { return c.encoding } 69 70 // Compression returns the compression codecs used by this column. 71 func (c *Column) Compression() compress.Codec { return c.compression } 72 73 // Path of the column in the parquet schema. 74 func (c *Column) Path() []string { return c.path[1:] } 75 76 // Name returns the column name. 77 func (c *Column) Name() string { return c.schema.Name } 78 79 // Columns returns the list of child columns. 80 // 81 // The method returns the same slice across multiple calls, the program must 82 // treat it as a read-only value. 83 func (c *Column) Columns() []*Column { return c.columns } 84 85 // Column returns the child column matching the given name. 86 func (c *Column) Column(name string) *Column { 87 for _, child := range c.columns { 88 if child.Name() == name { 89 return child 90 } 91 } 92 return nil 93 } 94 95 // Pages returns a reader exposing all pages in this column, across row groups. 96 func (c *Column) Pages() Pages { 97 if c.index < 0 { 98 return emptyPages{} 99 } 100 r := &columnPages{ 101 pages: make([]filePages, len(c.file.rowGroups)), 102 } 103 for i := range r.pages { 104 r.pages[i].init(c.file.rowGroups[i].(*fileRowGroup).columns[c.index].(*fileColumnChunk)) 105 } 106 return r 107 } 108 109 type columnPages struct { 110 pages []filePages 111 index int 112 } 113 114 func (c *columnPages) ReadPage() (Page, error) { 115 for { 116 if c.index >= len(c.pages) { 117 return nil, io.EOF 118 } 119 p, err := c.pages[c.index].ReadPage() 120 if err == nil || err != io.EOF { 121 return p, err 122 } 123 c.index++ 124 } 125 } 126 127 func (c *columnPages) SeekToRow(rowIndex int64) error { 128 c.index = 0 129 130 for c.index < len(c.pages) && c.pages[c.index].chunk.rowGroup.NumRows >= rowIndex { 131 rowIndex -= c.pages[c.index].chunk.rowGroup.NumRows 132 c.index++ 133 } 134 135 if c.index < len(c.pages) { 136 if err := c.pages[c.index].SeekToRow(rowIndex); err != nil { 137 return err 138 } 139 for i := range c.pages[c.index:] { 140 p := &c.pages[c.index+i] 141 if err := p.SeekToRow(0); err != nil { 142 return err 143 } 144 } 145 } 146 return nil 147 } 148 149 func (c *columnPages) Close() error { 150 var lastErr error 151 152 for i := range c.pages { 153 if err := c.pages[i].Close(); err != nil { 154 lastErr = err 155 } 156 } 157 158 c.pages = nil 159 c.index = 0 160 return lastErr 161 } 162 163 // Depth returns the position of the column relative to the root. 164 func (c *Column) Depth() int { return int(c.depth) } 165 166 // MaxRepetitionLevel returns the maximum value of repetition levels on this 167 // column. 168 func (c *Column) MaxRepetitionLevel() int { return int(c.maxRepetitionLevel) } 169 170 // MaxDefinitionLevel returns the maximum value of definition levels on this 171 // column. 172 func (c *Column) MaxDefinitionLevel() int { return int(c.maxDefinitionLevel) } 173 174 // Index returns the position of the column in a row. Only leaf columns have a 175 // column index, the method returns -1 when called on non-leaf columns. 176 func (c *Column) Index() int { return int(c.index) } 177 178 // GoType returns the Go type that best represents the parquet column. 179 func (c *Column) GoType() reflect.Type { return goTypeOf(c) } 180 181 // Value returns the sub-value in base for the child column at the given 182 // index. 183 func (c *Column) Value(base reflect.Value) reflect.Value { 184 return base.MapIndex(reflect.ValueOf(&c.schema.Name).Elem()) 185 } 186 187 // String returns a human-readable string representation of the column. 188 func (c *Column) String() string { return c.path.String() + ": " + sprint(c.Name(), c) } 189 190 func (c *Column) forEachLeaf(do func(*Column)) { 191 if len(c.columns) == 0 { 192 do(c) 193 } else { 194 for _, child := range c.columns { 195 child.forEachLeaf(do) 196 } 197 } 198 } 199 200 func openColumns(file *File) (*Column, error) { 201 cl := columnLoader{} 202 203 c, err := cl.open(file, nil) 204 if err != nil { 205 return nil, err 206 } 207 208 // Validate that there aren't extra entries in the row group columns, 209 // which would otherwise indicate that there are dangling data pages 210 // in the file. 211 for index, rowGroup := range file.metadata.RowGroups { 212 if cl.rowGroupColumnIndex != len(rowGroup.Columns) { 213 return nil, fmt.Errorf("row group at index %d contains %d columns but %d were referenced by the column schemas", 214 index, len(rowGroup.Columns), cl.rowGroupColumnIndex) 215 } 216 } 217 218 _, err = c.setLevels(0, 0, 0, 0) 219 return c, err 220 } 221 222 func (c *Column) setLevels(depth, repetition, definition, index int) (int, error) { 223 if depth > MaxColumnDepth { 224 return -1, fmt.Errorf("cannot represent parquet columns with more than %d nested levels: %s", MaxColumnDepth, c.path) 225 } 226 if index > MaxColumnIndex { 227 return -1, fmt.Errorf("cannot represent parquet rows with more than %d columns: %s", MaxColumnIndex, c.path) 228 } 229 if repetition > MaxRepetitionLevel { 230 return -1, fmt.Errorf("cannot represent parquet columns with more than %d repetition levels: %s", MaxRepetitionLevel, c.path) 231 } 232 if definition > MaxDefinitionLevel { 233 return -1, fmt.Errorf("cannot represent parquet columns with more than %d definition levels: %s", MaxDefinitionLevel, c.path) 234 } 235 236 switch schemaRepetitionTypeOf(c.schema) { 237 case format.Optional: 238 definition++ 239 case format.Repeated: 240 repetition++ 241 definition++ 242 } 243 244 c.depth = int8(depth) 245 c.maxRepetitionLevel = byte(repetition) 246 c.maxDefinitionLevel = byte(definition) 247 depth++ 248 249 if len(c.columns) > 0 { 250 c.index = -1 251 } else { 252 c.index = int16(index) 253 index++ 254 } 255 256 var err error 257 for _, child := range c.columns { 258 if index, err = child.setLevels(depth, repetition, definition, index); err != nil { 259 return -1, err 260 } 261 } 262 return index, nil 263 } 264 265 type columnLoader struct { 266 schemaIndex int 267 columnOrderIndex int 268 rowGroupColumnIndex int 269 } 270 271 func (cl *columnLoader) open(file *File, path []string) (*Column, error) { 272 c := &Column{ 273 file: file, 274 schema: &file.metadata.Schema[cl.schemaIndex], 275 } 276 c.path = columnPath(path).append(c.schema.Name) 277 278 cl.schemaIndex++ 279 numChildren := int(c.schema.NumChildren) 280 281 if numChildren == 0 { 282 c.typ = schemaElementTypeOf(c.schema) 283 284 if cl.columnOrderIndex < len(file.metadata.ColumnOrders) { 285 c.order = &file.metadata.ColumnOrders[cl.columnOrderIndex] 286 cl.columnOrderIndex++ 287 } 288 289 rowGroups := file.metadata.RowGroups 290 rowGroupColumnIndex := cl.rowGroupColumnIndex 291 cl.rowGroupColumnIndex++ 292 293 c.chunks = make([]*format.ColumnChunk, 0, len(rowGroups)) 294 c.columnIndex = make([]*format.ColumnIndex, 0, len(rowGroups)) 295 c.offsetIndex = make([]*format.OffsetIndex, 0, len(rowGroups)) 296 297 for i, rowGroup := range rowGroups { 298 if rowGroupColumnIndex >= len(rowGroup.Columns) { 299 return nil, fmt.Errorf("row group at index %d does not have enough columns", i) 300 } 301 c.chunks = append(c.chunks, &rowGroup.Columns[rowGroupColumnIndex]) 302 } 303 304 if len(file.columnIndexes) > 0 { 305 for i := range rowGroups { 306 if rowGroupColumnIndex >= len(file.columnIndexes) { 307 return nil, fmt.Errorf("row group at index %d does not have enough column index pages", i) 308 } 309 c.columnIndex = append(c.columnIndex, &file.columnIndexes[rowGroupColumnIndex]) 310 } 311 } 312 313 if len(file.offsetIndexes) > 0 { 314 for i := range rowGroups { 315 if rowGroupColumnIndex >= len(file.offsetIndexes) { 316 return nil, fmt.Errorf("row group at index %d does not have enough offset index pages", i) 317 } 318 c.offsetIndex = append(c.offsetIndex, &file.offsetIndexes[rowGroupColumnIndex]) 319 } 320 } 321 322 if len(c.chunks) > 0 { 323 // Pick the encoding and compression codec of the first chunk. 324 // 325 // Technically each column chunk may use a different compression 326 // codec, and each page of the column chunk might have a different 327 // encoding. Exposing these details does not provide a lot of value 328 // to the end user. 329 // 330 // Programs that wish to determine the encoding and compression of 331 // each page of the column should iterate through the pages and read 332 // the page headers to determine which compression and encodings are 333 // applied. 334 for _, encoding := range c.chunks[0].MetaData.Encoding { 335 if c.encoding == nil { 336 c.encoding = LookupEncoding(encoding) 337 } 338 if encoding != format.Plain && encoding != format.RLE { 339 c.encoding = LookupEncoding(encoding) 340 break 341 } 342 } 343 c.compression = LookupCompressionCodec(c.chunks[0].MetaData.Codec) 344 } 345 346 return c, nil 347 } 348 349 c.typ = &groupType{} 350 c.columns = make([]*Column, numChildren) 351 352 for i := range c.columns { 353 if cl.schemaIndex >= len(file.metadata.Schema) { 354 return nil, fmt.Errorf("column %q has more children than there are schemas in the file: %d > %d", 355 c.schema.Name, cl.schemaIndex+1, len(file.metadata.Schema)) 356 } 357 358 var err error 359 c.columns[i], err = cl.open(file, c.path) 360 if err != nil { 361 return nil, fmt.Errorf("%s: %w", c.schema.Name, err) 362 } 363 } 364 365 return c, nil 366 } 367 368 func schemaElementTypeOf(s *format.SchemaElement) Type { 369 if lt := s.LogicalType; lt != nil { 370 // A logical type exists, the Type interface implementations in this 371 // package are all based on the logical parquet types declared in the 372 // format sub-package so we can return them directly via a pointer type 373 // conversion. 374 switch { 375 case lt.UTF8 != nil: 376 return (*stringType)(lt.UTF8) 377 case lt.Map != nil: 378 return (*mapType)(lt.Map) 379 case lt.List != nil: 380 return (*listType)(lt.List) 381 case lt.Enum != nil: 382 return (*enumType)(lt.Enum) 383 case lt.Decimal != nil: 384 // A parquet decimal can be one of several different physical types. 385 if t := s.Type; t != nil { 386 var typ Type 387 switch kind := Kind(*s.Type); kind { 388 case Int32: 389 typ = Int32Type 390 case Int64: 391 typ = Int64Type 392 case FixedLenByteArray: 393 if s.TypeLength == nil { 394 panic("DECIMAL using FIXED_LEN_BYTE_ARRAY must specify a length") 395 } 396 typ = FixedLenByteArrayType(int(*s.TypeLength)) 397 default: 398 panic("DECIMAL must be of type INT32, INT64, or FIXED_LEN_BYTE_ARRAY but got " + kind.String()) 399 } 400 return &decimalType{ 401 decimal: *lt.Decimal, 402 Type: typ, 403 } 404 } 405 case lt.Date != nil: 406 return (*dateType)(lt.Date) 407 case lt.Time != nil: 408 return (*timeType)(lt.Time) 409 case lt.Timestamp != nil: 410 return (*timestampType)(lt.Timestamp) 411 case lt.Integer != nil: 412 return (*intType)(lt.Integer) 413 case lt.Unknown != nil: 414 return (*nullType)(lt.Unknown) 415 case lt.Json != nil: 416 return (*jsonType)(lt.Json) 417 case lt.Bson != nil: 418 return (*bsonType)(lt.Bson) 419 case lt.UUID != nil: 420 return (*uuidType)(lt.UUID) 421 } 422 } 423 424 if ct := s.ConvertedType; ct != nil { 425 // This column contains no logical type but has a converted type, it 426 // was likely created by an older parquet writer. Convert the legacy 427 // type representation to the equivalent logical parquet type. 428 switch *ct { 429 case deprecated.UTF8: 430 return &stringType{} 431 case deprecated.Map: 432 return &mapType{} 433 case deprecated.MapKeyValue: 434 return &groupType{} 435 case deprecated.List: 436 return &listType{} 437 case deprecated.Enum: 438 return &enumType{} 439 case deprecated.Decimal: 440 if s.Scale != nil && s.Precision != nil { 441 // A parquet decimal can be one of several different physical types. 442 if t := s.Type; t != nil { 443 var typ Type 444 switch kind := Kind(*s.Type); kind { 445 case Int32: 446 typ = Int32Type 447 case Int64: 448 typ = Int64Type 449 case FixedLenByteArray: 450 if s.TypeLength == nil { 451 panic("DECIMAL using FIXED_LEN_BYTE_ARRAY must specify a length") 452 } 453 typ = FixedLenByteArrayType(int(*s.TypeLength)) 454 case ByteArray: 455 typ = ByteArrayType 456 default: 457 panic("DECIMAL must be of type INT32, INT64, BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY but got " + kind.String()) 458 } 459 return &decimalType{ 460 decimal: format.DecimalType{ 461 Scale: *s.Scale, 462 Precision: *s.Precision, 463 }, 464 Type: typ, 465 } 466 } 467 } 468 case deprecated.Date: 469 return &dateType{} 470 case deprecated.TimeMillis: 471 return &timeType{IsAdjustedToUTC: true, Unit: Millisecond.TimeUnit()} 472 case deprecated.TimeMicros: 473 return &timeType{IsAdjustedToUTC: true, Unit: Microsecond.TimeUnit()} 474 case deprecated.TimestampMillis: 475 return ×tampType{IsAdjustedToUTC: true, Unit: Millisecond.TimeUnit()} 476 case deprecated.TimestampMicros: 477 return ×tampType{IsAdjustedToUTC: true, Unit: Microsecond.TimeUnit()} 478 case deprecated.Uint8: 479 return &unsignedIntTypes[0] 480 case deprecated.Uint16: 481 return &unsignedIntTypes[1] 482 case deprecated.Uint32: 483 return &unsignedIntTypes[2] 484 case deprecated.Uint64: 485 return &unsignedIntTypes[3] 486 case deprecated.Int8: 487 return &signedIntTypes[0] 488 case deprecated.Int16: 489 return &signedIntTypes[1] 490 case deprecated.Int32: 491 return &signedIntTypes[2] 492 case deprecated.Int64: 493 return &signedIntTypes[3] 494 case deprecated.Json: 495 return &jsonType{} 496 case deprecated.Bson: 497 return &bsonType{} 498 case deprecated.Interval: 499 // TODO 500 } 501 } 502 503 if t := s.Type; t != nil { 504 // The column only has a physical type, convert it to one of the 505 // primitive types supported by this package. 506 switch kind := Kind(*t); kind { 507 case Boolean: 508 return BooleanType 509 case Int32: 510 return Int32Type 511 case Int64: 512 return Int64Type 513 case Int96: 514 return Int96Type 515 case Float: 516 return FloatType 517 case Double: 518 return DoubleType 519 case ByteArray: 520 return ByteArrayType 521 case FixedLenByteArray: 522 if s.TypeLength != nil { 523 return FixedLenByteArrayType(int(*s.TypeLength)) 524 } 525 } 526 } 527 528 // If we reach this point, we are likely reading a parquet column that was 529 // written with a non-standard type or is in a newer version of the format 530 // than this package supports. 531 return &nullType{} 532 } 533 534 func schemaRepetitionTypeOf(s *format.SchemaElement) format.FieldRepetitionType { 535 if s.RepetitionType != nil { 536 return *s.RepetitionType 537 } 538 return format.Required 539 } 540 541 func (c *Column) decompress(compressedPageData []byte, uncompressedPageSize int32) (page *buffer, err error) { 542 page = buffers.get(int(uncompressedPageSize)) 543 page.data, err = c.compression.Decode(page.data, compressedPageData) 544 if err != nil { 545 page.unref() 546 page = nil 547 } 548 return page, err 549 } 550 551 // DecodeDataPageV1 decodes a data page from the header, compressed data, and 552 // optional dictionary passed as arguments. 553 func (c *Column) DecodeDataPageV1(header DataPageHeaderV1, page []byte, dict Dictionary) (Page, error) { 554 return c.decodeDataPageV1(header, &buffer{data: page}, dict, -1) 555 } 556 557 func (c *Column) decodeDataPageV1(header DataPageHeaderV1, page *buffer, dict Dictionary, size int32) (Page, error) { 558 var pageData = page.data 559 var err error 560 561 if isCompressed(c.compression) { 562 if page, err = c.decompress(pageData, size); err != nil { 563 return nil, fmt.Errorf("decompressing data page v1: %w", err) 564 } 565 defer page.unref() 566 pageData = page.data 567 } 568 569 var numValues = int(header.NumValues()) 570 var repetitionLevels *buffer 571 var definitionLevels *buffer 572 573 if c.maxRepetitionLevel > 0 { 574 encoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel) 575 repetitionLevels, pageData, err = decodeLevelsV1(encoding, numValues, pageData) 576 if err != nil { 577 return nil, fmt.Errorf("decoding repetition levels of data page v1: %w", err) 578 } 579 defer repetitionLevels.unref() 580 } 581 582 if c.maxDefinitionLevel > 0 { 583 encoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel) 584 definitionLevels, pageData, err = decodeLevelsV1(encoding, numValues, pageData) 585 if err != nil { 586 return nil, fmt.Errorf("decoding definition levels of data page v1: %w", err) 587 } 588 defer definitionLevels.unref() 589 590 // Data pages v1 did not embed the number of null values, 591 // so we have to compute it from the definition levels. 592 numValues -= countLevelsNotEqual(definitionLevels.data, c.maxDefinitionLevel) 593 } 594 595 return c.decodeDataPage(header, numValues, repetitionLevels, definitionLevels, page, pageData, dict) 596 } 597 598 // DecodeDataPageV2 decodes a data page from the header, compressed data, and 599 // optional dictionary passed as arguments. 600 func (c *Column) DecodeDataPageV2(header DataPageHeaderV2, page []byte, dict Dictionary) (Page, error) { 601 return c.decodeDataPageV2(header, &buffer{data: page}, dict, -1) 602 } 603 604 func (c *Column) decodeDataPageV2(header DataPageHeaderV2, page *buffer, dict Dictionary, size int32) (Page, error) { 605 var numValues = int(header.NumValues()) 606 var pageData = page.data 607 var err error 608 var repetitionLevels *buffer 609 var definitionLevels *buffer 610 611 if length := header.RepetitionLevelsByteLength(); length > 0 { 612 if c.maxRepetitionLevel == 0 { 613 // In some cases we've observed files which have a non-zero 614 // repetition level despite the column not being repeated 615 // (nor nested within a repeated column). 616 // 617 // See https://github.com/apache/parquet-testing/pull/24 618 pageData, err = skipLevelsV2(pageData, length) 619 } else { 620 encoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel) 621 repetitionLevels, pageData, err = decodeLevelsV2(encoding, numValues, pageData, length) 622 } 623 if err != nil { 624 return nil, fmt.Errorf("decoding repetition levels of data page v2: %w", io.ErrUnexpectedEOF) 625 } 626 if repetitionLevels != nil { 627 defer repetitionLevels.unref() 628 } 629 } 630 631 if length := header.DefinitionLevelsByteLength(); length > 0 { 632 if c.maxDefinitionLevel == 0 { 633 pageData, err = skipLevelsV2(pageData, length) 634 } else { 635 encoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel) 636 definitionLevels, pageData, err = decodeLevelsV2(encoding, numValues, pageData, length) 637 } 638 if err != nil { 639 return nil, fmt.Errorf("decoding definition levels of data page v2: %w", io.ErrUnexpectedEOF) 640 } 641 if definitionLevels != nil { 642 defer definitionLevels.unref() 643 } 644 } 645 646 if isCompressed(c.compression) && header.IsCompressed() { 647 if page, err = c.decompress(pageData, size); err != nil { 648 return nil, fmt.Errorf("decompressing data page v2: %w", err) 649 } 650 defer page.unref() 651 pageData = page.data 652 } 653 654 numValues -= int(header.NumNulls()) 655 return c.decodeDataPage(header, numValues, repetitionLevels, definitionLevels, page, pageData, dict) 656 } 657 658 func (c *Column) decodeDataPage(header DataPageHeader, numValues int, repetitionLevels, definitionLevels, page *buffer, data []byte, dict Dictionary) (Page, error) { 659 pageEncoding := LookupEncoding(header.Encoding()) 660 pageType := c.Type() 661 662 if isDictionaryEncoding(pageEncoding) { 663 // In some legacy configurations, the PLAIN_DICTIONARY encoding is used 664 // on data page headers to indicate that the page contains indexes into 665 // the dictionary page, but the page is still encoded using the RLE 666 // encoding in this case, so we convert it to RLE_DICTIONARY. 667 pageEncoding = &RLEDictionary 668 pageType = indexedPageType{newIndexedType(pageType, dict)} 669 } 670 671 var vbuf, obuf *buffer 672 var pageValues []byte 673 var pageOffsets []uint32 674 675 if pageEncoding.CanDecodeInPlace() { 676 vbuf = page 677 pageValues = data 678 } else { 679 vbuf = buffers.get(pageType.EstimateDecodeSize(numValues, data, pageEncoding)) 680 defer vbuf.unref() 681 pageValues = vbuf.data 682 } 683 684 // Page offsets not needed when dictionary-encoded 685 if pageType.Kind() == ByteArray && !isDictionaryEncoding(pageEncoding) { 686 obuf = buffers.get(4 * (numValues + 1)) 687 defer obuf.unref() 688 pageOffsets = unsafecast.BytesToUint32(obuf.data) 689 } 690 691 values := pageType.NewValues(pageValues, pageOffsets) 692 values, err := pageType.Decode(values, data, pageEncoding) 693 if err != nil { 694 return nil, err 695 } 696 697 newPage := pageType.NewPage(c.Index(), numValues, values) 698 switch { 699 case c.maxRepetitionLevel > 0: 700 newPage = newRepeatedPage( 701 newPage, 702 c.maxRepetitionLevel, 703 c.maxDefinitionLevel, 704 repetitionLevels.data, 705 definitionLevels.data, 706 ) 707 case c.maxDefinitionLevel > 0: 708 newPage = newOptionalPage( 709 newPage, 710 c.maxDefinitionLevel, 711 definitionLevels.data, 712 ) 713 } 714 715 return newBufferedPage(newPage, vbuf, obuf, repetitionLevels, definitionLevels), nil 716 } 717 718 func decodeLevelsV1(enc encoding.Encoding, numValues int, data []byte) (*buffer, []byte, error) { 719 if len(data) < 4 { 720 return nil, data, io.ErrUnexpectedEOF 721 } 722 i := 4 723 j := 4 + int(binary.LittleEndian.Uint32(data)) 724 if j > len(data) { 725 return nil, data, io.ErrUnexpectedEOF 726 } 727 levels, err := decodeLevels(enc, numValues, data[i:j]) 728 return levels, data[j:], err 729 } 730 731 func decodeLevelsV2(enc encoding.Encoding, numValues int, data []byte, length int64) (*buffer, []byte, error) { 732 levels, err := decodeLevels(enc, numValues, data[:length]) 733 return levels, data[length:], err 734 } 735 736 func decodeLevels(enc encoding.Encoding, numValues int, data []byte) (levels *buffer, err error) { 737 levels = buffers.get(numValues) 738 levels.data, err = enc.DecodeLevels(levels.data, data) 739 if err != nil { 740 levels.unref() 741 levels = nil 742 } else { 743 switch { 744 case len(levels.data) < numValues: 745 err = fmt.Errorf("decoding level expected %d values but got only %d", numValues, len(levels.data)) 746 case len(levels.data) > numValues: 747 levels.data = levels.data[:numValues] 748 } 749 } 750 return levels, err 751 } 752 753 func skipLevelsV2(data []byte, length int64) ([]byte, error) { 754 if length >= int64(len(data)) { 755 return data, io.ErrUnexpectedEOF 756 } 757 return data[length:], nil 758 } 759 760 // DecodeDictionary decodes a data page from the header and compressed data 761 // passed as arguments. 762 func (c *Column) DecodeDictionary(header DictionaryPageHeader, page []byte) (Dictionary, error) { 763 return c.decodeDictionary(header, &buffer{data: page}, -1) 764 } 765 766 func (c *Column) decodeDictionary(header DictionaryPageHeader, page *buffer, size int32) (Dictionary, error) { 767 pageData := page.data 768 769 if isCompressed(c.compression) { 770 var err error 771 if page, err = c.decompress(pageData, size); err != nil { 772 return nil, fmt.Errorf("decompressing dictionary page: %w", err) 773 } 774 defer page.unref() 775 pageData = page.data 776 } 777 778 pageType := c.Type() 779 pageEncoding := header.Encoding() 780 if pageEncoding == format.PlainDictionary { 781 pageEncoding = format.Plain 782 } 783 784 numValues := int(header.NumValues()) 785 values := pageType.NewValues(nil, nil) 786 values, err := pageType.Decode(values, pageData, LookupEncoding(pageEncoding)) 787 if err != nil { 788 return nil, err 789 } 790 return pageType.NewDictionary(int(c.index), numValues, values), nil 791 } 792 793 var ( 794 _ Node = (*Column)(nil) 795 )