github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/file.go

package parquet

import (
	"bufio"
	"encoding/binary"
	"fmt"
	"hash/crc32"
	"io"
	"sort"
	"strings"
	"sync"

	"github.com/segmentio/encoding/thrift"
	"github.com/segmentio/parquet-go/format"
)

const (
	defaultDictBufferSize = 8192
	defaultReadBufferSize = 4096
)

// File represents a parquet file. The layout of a Parquet file can be found
// here: https://github.com/apache/parquet-format#file-format
type File struct {
	metadata      format.FileMetaData
	protocol      thrift.CompactProtocol
	reader        io.ReaderAt
	size          int64
	schema        *Schema
	root          *Column
	columnIndexes []format.ColumnIndex
	offsetIndexes []format.OffsetIndex
	rowGroups     []RowGroup
	config        *FileConfig
}

// OpenFile opens a parquet file and reads the content between offset 0 and
// the given size in r.
//
// Only the parquet magic bytes and footer are read, column chunks and other
// parts of the file are left untouched; this means that successfully opening
// a file does not validate that the pages have valid checksums.
func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) {
	b := make([]byte, 8)
	c, err := NewFileConfig(options...)
	if err != nil {
		return nil, err
	}
	f := &File{reader: r, size: size, config: c}

	if _, err := r.ReadAt(b[:4], 0); err != nil {
		return nil, fmt.Errorf("reading magic header of parquet file: %w", err)
	}
	if string(b[:4]) != "PAR1" {
		return nil, fmt.Errorf("invalid magic header of parquet file: %q", b[:4])
	}

	if cast, ok := f.reader.(interface{ SetMagicFooterSection(offset, length int64) }); ok {
		cast.SetMagicFooterSection(size-8, 8)
	}
	if n, err := r.ReadAt(b[:8], size-8); n != 8 {
		return nil, fmt.Errorf("reading magic footer of parquet file: %w", err)
	}
	if string(b[4:8]) != "PAR1" {
		return nil, fmt.Errorf("invalid magic footer of parquet file: %q", b[4:8])
	}

	footerSize := int64(binary.LittleEndian.Uint32(b[:4]))
	footerData := make([]byte, footerSize)

	if cast, ok := f.reader.(interface{ SetFooterSection(offset, length int64) }); ok {
		cast.SetFooterSection(size-(footerSize+8), footerSize)
	}
	if _, err := f.reader.ReadAt(footerData, size-(footerSize+8)); err != nil {
		return nil, fmt.Errorf("reading footer of parquet file: %w", err)
	}
	if err := thrift.Unmarshal(&f.protocol, footerData, &f.metadata); err != nil {
		return nil, fmt.Errorf("reading parquet file metadata: %w", err)
	}
	if len(f.metadata.Schema) == 0 {
		return nil, ErrMissingRootColumn
	}

	if !c.SkipPageIndex {
		if f.columnIndexes, f.offsetIndexes, err = f.ReadPageIndex(); err != nil {
			return nil, fmt.Errorf("reading page index of parquet file: %w", err)
		}
	}

	if f.root, err = openColumns(f); err != nil {
		return nil, fmt.Errorf("opening columns of parquet file: %w", err)
	}

	var schema *Schema
	if c.Schema != nil {
		schema = c.Schema
	} else {
		schema = NewSchema(f.root.Name(), f.root)
	}
	columns := make([]*Column, 0, numLeafColumnsOf(f.root))
	f.schema = schema
	f.root.forEachLeaf(func(c *Column) { columns = append(columns, c) })

	rowGroups := make([]fileRowGroup, len(f.metadata.RowGroups))
	for i := range rowGroups {
		rowGroups[i].init(f, schema, columns, &f.metadata.RowGroups[i])
	}
	f.rowGroups = make([]RowGroup, len(rowGroups))
	for i := range rowGroups {
		f.rowGroups[i] = &rowGroups[i]
	}
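
	// Unless the SkipBloomFilters option was set, position a buffered reader
	// at each column chunk's recorded BloomFilterOffset and decode the bloom
	// filter header found there. Only the header is decoded eagerly;
	// newBloomFilter keeps the reader and offset used to access the filter
	// bits.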
	if !c.SkipBloomFilters {
		section := io.NewSectionReader(r, 0, size)
		rbuf, rbufpool := getBufioReader(section, c.ReadBufferSize)
		defer putBufioReader(rbuf, rbufpool)

		header := format.BloomFilterHeader{}
		compact := thrift.CompactProtocol{}
		decoder := thrift.NewDecoder(compact.NewReader(rbuf))

		for i := range rowGroups {
			g := &rowGroups[i]

			for j := range g.columns {
				c := g.columns[j].(*fileColumnChunk)

				if offset := c.chunk.MetaData.BloomFilterOffset; offset > 0 {
					section.Seek(offset, io.SeekStart)
					rbuf.Reset(section)

					header = format.BloomFilterHeader{}
					if err := decoder.Decode(&header); err != nil {
						return nil, fmt.Errorf("decoding bloom filter header: %w", err)
					}

					offset, _ = section.Seek(0, io.SeekCurrent)
					offset -= int64(rbuf.Buffered())

					if cast, ok := r.(interface{ SetBloomFilterSection(offset, length int64) }); ok {
						bloomFilterOffset := c.chunk.MetaData.BloomFilterOffset
						bloomFilterLength := (offset - bloomFilterOffset) + int64(header.NumBytes)
						cast.SetBloomFilterSection(bloomFilterOffset, bloomFilterLength)
					}

					c.bloomFilter = newBloomFilter(r, offset, &header)
				}
			}
		}
	}

	sortKeyValueMetadata(f.metadata.KeyValueMetadata)
	return f, nil
}

// ReadPageIndex reads the page index section of the parquet file f.
//
// If the file did not contain a page index, the method returns two empty
// slices and a nil error.
//
// Only leaf columns have indexes; the returned indexes are arranged using the
// following layout:
//
//	------------------
//	| col 0: chunk 0 |
//	------------------
//	| col 1: chunk 0 |
//	------------------
//	| ...            |
//	------------------
//	| col 0: chunk 1 |
//	------------------
//	| col 1: chunk 1 |
//	------------------
//	| ...            |
//	------------------
//
// This method is useful in combination with the SkipPageIndex option to delay
// reading the page index section until after the file was opened. Note that
// in this case the page index is not cached within the file; programs are
// expected to make use of it independently of the parquet package.
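//
// A minimal sketch of deferring the page index read (assuming the
// SkipPageIndex option takes a boolean; error handling elided):
//
//	f, _ := parquet.OpenFile(r, size, parquet.SkipPageIndex(true))
//	columnIndexes, offsetIndexes, err := f.ReadPageIndex()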
func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error) {
	if len(f.metadata.RowGroups) == 0 {
		return nil, nil, nil
	}

	columnIndexOffset := f.metadata.RowGroups[0].Columns[0].ColumnIndexOffset
	offsetIndexOffset := f.metadata.RowGroups[0].Columns[0].OffsetIndexOffset
	columnIndexLength := int64(0)
	offsetIndexLength := int64(0)

	forEachColumnChunk := func(do func(int, int, *format.ColumnChunk) error) error {
		for i := range f.metadata.RowGroups {
			for j := range f.metadata.RowGroups[i].Columns {
				c := &f.metadata.RowGroups[i].Columns[j]
				if err := do(i, j, c); err != nil {
					return err
				}
			}
		}
		return nil
	}

	forEachColumnChunk(func(_, _ int, c *format.ColumnChunk) error {
		columnIndexLength += int64(c.ColumnIndexLength)
		offsetIndexLength += int64(c.OffsetIndexLength)
		return nil
	})

	if columnIndexLength == 0 && offsetIndexLength == 0 {
		return nil, nil, nil
	}

	numRowGroups := len(f.metadata.RowGroups)
	numColumns := len(f.metadata.RowGroups[0].Columns)
	numColumnChunks := numRowGroups * numColumns

	columnIndexes := make([]format.ColumnIndex, numColumnChunks)
	offsetIndexes := make([]format.OffsetIndex, numColumnChunks)
	indexBuffer := make([]byte, max(int(columnIndexLength), int(offsetIndexLength)))

	if columnIndexOffset > 0 {
		columnIndexData := indexBuffer[:columnIndexLength]

		if cast, ok := f.reader.(interface{ SetColumnIndexSection(offset, length int64) }); ok {
			cast.SetColumnIndexSection(columnIndexOffset, columnIndexLength)
		}
		if _, err := f.reader.ReadAt(columnIndexData, columnIndexOffset); err != nil {
			return nil, nil, fmt.Errorf("reading %d bytes column index at offset %d: %w", columnIndexLength, columnIndexOffset, err)
		}

		err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {
			// Some parquet files are missing the column index on some columns.
			//
			// An example of such a file is testdata/alltypes_tiny_pages_plain.parquet,
			// which was added in https://github.com/apache/parquet-testing/pull/24.
			if c.ColumnIndexOffset > 0 {
				offset := c.ColumnIndexOffset - columnIndexOffset
				length := int64(c.ColumnIndexLength)
				buffer := columnIndexData[offset : offset+length]
				if err := thrift.Unmarshal(&f.protocol, buffer, &columnIndexes[(i*numColumns)+j]); err != nil {
					return fmt.Errorf("decoding column index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err)
				}
			}
			return nil
		})
		if err != nil {
			return nil, nil, err
		}
	}
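
	// Offsets recorded in the file metadata are absolute, so each chunk's
	// offset index lives at c.OffsetIndexOffset-offsetIndexOffset within the
	// contiguous section read below, mirroring the column index decoding
	// above.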
	if offsetIndexOffset > 0 {
		offsetIndexData := indexBuffer[:offsetIndexLength]

		if cast, ok := f.reader.(interface{ SetOffsetIndexSection(offset, length int64) }); ok {
			cast.SetOffsetIndexSection(offsetIndexOffset, offsetIndexLength)
		}
		if _, err := f.reader.ReadAt(offsetIndexData, offsetIndexOffset); err != nil {
			return nil, nil, fmt.Errorf("reading %d bytes offset index at offset %d: %w", offsetIndexLength, offsetIndexOffset, err)
		}

		err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {
			if c.OffsetIndexOffset > 0 {
				offset := c.OffsetIndexOffset - offsetIndexOffset
				length := int64(c.OffsetIndexLength)
				buffer := offsetIndexData[offset : offset+length]
				if err := thrift.Unmarshal(&f.protocol, buffer, &offsetIndexes[(i*numColumns)+j]); err != nil {
					return fmt.Errorf("decoding offset index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err)
				}
			}
			return nil
		})
		if err != nil {
			return nil, nil, err
		}
	}

	return columnIndexes, offsetIndexes, nil
}

// NumRows returns the number of rows in the file.
func (f *File) NumRows() int64 { return f.metadata.NumRows }

// RowGroups returns the list of row groups in the file.
func (f *File) RowGroups() []RowGroup { return f.rowGroups }

// Root returns the root column of f.
func (f *File) Root() *Column { return f.root }

// Schema returns the schema of f.
func (f *File) Schema() *Schema { return f.schema }

// Metadata returns the metadata of f.
func (f *File) Metadata() *format.FileMetaData { return &f.metadata }

// Size returns the size of f (in bytes).
func (f *File) Size() int64 { return f.size }

// ReadAt reads bytes into b from f at the given offset.
//
// The method satisfies the io.ReaderAt interface.
func (f *File) ReadAt(b []byte, off int64) (int, error) {
	if off < 0 || off >= f.size {
		return 0, io.EOF
	}

	if limit := f.size - off; limit < int64(len(b)) {
		n, err := f.reader.ReadAt(b[:limit], off)
		if err == nil {
			err = io.EOF
		}
		return n, err
	}

	return f.reader.ReadAt(b, off)
}

// ColumnIndexes returns the column indexes of the parquet file f.
//
// If the file did not contain a column index, the method returns an empty
// slice and a nil error.
func (f *File) ColumnIndexes() []format.ColumnIndex { return f.columnIndexes }

// OffsetIndexes returns the offset indexes of the parquet file f.
//
// If the file did not contain an offset index, the method returns an empty
// slice and a nil error.
func (f *File) OffsetIndexes() []format.OffsetIndex { return f.offsetIndexes }

// Lookup returns the value associated with the given key in the file key/value
// metadata.
//
// The ok boolean will be true if the key was found, false otherwise.
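//
// A minimal usage sketch; the metadata key below is hypothetical:
//
//	if v, ok := f.Lookup("my.app.version"); ok {
//		fmt.Println("file written by app version", v)
//	}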
func (f *File) Lookup(key string) (value string, ok bool) {
	return lookupKeyValueMetadata(f.metadata.KeyValueMetadata, key)
}

func (f *File) hasIndexes() bool {
	return f.columnIndexes != nil && f.offsetIndexes != nil
}

var _ io.ReaderAt = (*File)(nil)

func sortKeyValueMetadata(keyValueMetadata []format.KeyValue) {
	sort.Slice(keyValueMetadata, func(i, j int) bool {
		switch {
		case keyValueMetadata[i].Key < keyValueMetadata[j].Key:
			return true
		case keyValueMetadata[i].Key > keyValueMetadata[j].Key:
			return false
		default:
			return keyValueMetadata[i].Value < keyValueMetadata[j].Value
		}
	})
}

func lookupKeyValueMetadata(keyValueMetadata []format.KeyValue, key string) (value string, ok bool) {
	i := sort.Search(len(keyValueMetadata), func(i int) bool {
		return keyValueMetadata[i].Key >= key
	})
	if i == len(keyValueMetadata) || keyValueMetadata[i].Key != key {
		return "", false
	}
	return keyValueMetadata[i].Value, true
}

type fileRowGroup struct {
	schema   *Schema
	rowGroup *format.RowGroup
	columns  []ColumnChunk
	sorting  []SortingColumn
	config   *FileConfig
}

func (g *fileRowGroup) init(file *File, schema *Schema, columns []*Column, rowGroup *format.RowGroup) {
	g.schema = schema
	g.rowGroup = rowGroup
	g.config = file.config
	g.columns = make([]ColumnChunk, len(rowGroup.Columns))
	g.sorting = make([]SortingColumn, len(rowGroup.SortingColumns))
	fileColumnChunks := make([]fileColumnChunk, len(rowGroup.Columns))

	for i := range g.columns {
		fileColumnChunks[i] = fileColumnChunk{
			file:     file,
			column:   columns[i],
			rowGroup: rowGroup,
			chunk:    &rowGroup.Columns[i],
		}

		if file.hasIndexes() {
			j := (int(rowGroup.Ordinal) * len(columns)) + i
			fileColumnChunks[i].columnIndex = &file.columnIndexes[j]
			fileColumnChunks[i].offsetIndex = &file.offsetIndexes[j]
		}

		g.columns[i] = &fileColumnChunks[i]
	}

	for i := range g.sorting {
		g.sorting[i] = &fileSortingColumn{
			column:     columns[rowGroup.SortingColumns[i].ColumnIdx],
			descending: rowGroup.SortingColumns[i].Descending,
			nullsFirst: rowGroup.SortingColumns[i].NullsFirst,
		}
	}
}

func (g *fileRowGroup) Schema() *Schema                 { return g.schema }
func (g *fileRowGroup) NumRows() int64                  { return g.rowGroup.NumRows }
func (g *fileRowGroup) ColumnChunks() []ColumnChunk     { return g.columns }
func (g *fileRowGroup) SortingColumns() []SortingColumn { return g.sorting }
func (g *fileRowGroup) Rows() Rows                      { return newRowGroupRows(g, g.config.ReadMode) }

type fileSortingColumn struct {
	column     *Column
	descending bool
	nullsFirst bool
}

func (s *fileSortingColumn) Path() []string   { return s.column.Path() }
func (s *fileSortingColumn) Descending() bool { return s.descending }
func (s *fileSortingColumn) NullsFirst() bool { return s.nullsFirst }
func (s *fileSortingColumn) String() string {
	b := new(strings.Builder)
	if s.nullsFirst {
		b.WriteString("nulls_first+")
	}
	if s.descending {
		b.WriteString("descending(")
	} else {
		b.WriteString("ascending(")
	}
	b.WriteString(columnPath(s.Path()).String())
	b.WriteString(")")
	return b.String()
}

type fileColumnChunk struct {
	file        *File
	column      *Column
	bloomFilter *bloomFilter
	rowGroup    *format.RowGroup
	columnIndex *format.ColumnIndex
	offsetIndex *format.OffsetIndex
	chunk       *format.ColumnChunk
}
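
// Compile-time check mirroring the io.ReaderAt assertion above: fileRowGroup
// stores *fileColumnChunk values in its []ColumnChunk slice, so the method
// set below must satisfy the ColumnChunk interface.
var _ ColumnChunk = (*fileColumnChunk)(nil)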

func (c *fileColumnChunk) Type() Type {
	return c.column.Type()
}

func (c *fileColumnChunk) Column() int {
	return int(c.column.Index())
}

func (c *fileColumnChunk) Pages() Pages {
	r := new(filePages)
	r.init(c)
	return r
}

func (c *fileColumnChunk) ColumnIndex() ColumnIndex {
	if c.columnIndex == nil {
		return nil
	}
	return fileColumnIndex{c}
}

func (c *fileColumnChunk) OffsetIndex() OffsetIndex {
	if c.offsetIndex == nil {
		return nil
	}
	return (*fileOffsetIndex)(c.offsetIndex)
}

func (c *fileColumnChunk) BloomFilter() BloomFilter {
	if c.bloomFilter == nil {
		return nil
	}
	return c.bloomFilter
}

func (c *fileColumnChunk) NumValues() int64 {
	return c.chunk.MetaData.NumValues
}

type filePages struct {
	chunk    *fileColumnChunk
	rbuf     *bufio.Reader
	rbufpool *sync.Pool
	section  io.SectionReader

	protocol thrift.CompactProtocol
	decoder  thrift.Decoder

	baseOffset int64
	dataOffset int64
	dictOffset int64
	index      int
	skip       int64
	dictionary Dictionary

	bufferSize int
}

func (f *filePages) init(c *fileColumnChunk) {
	f.chunk = c
	f.baseOffset = c.chunk.MetaData.DataPageOffset
	f.dataOffset = f.baseOffset
	f.bufferSize = c.file.config.ReadBufferSize

	if c.chunk.MetaData.DictionaryPageOffset != 0 {
		f.baseOffset = c.chunk.MetaData.DictionaryPageOffset
		f.dictOffset = f.baseOffset
	}

	f.section = *io.NewSectionReader(c.file, f.baseOffset, c.chunk.MetaData.TotalCompressedSize)
	f.rbuf, f.rbufpool = getBufioReader(&f.section, f.bufferSize)
	f.decoder.Reset(f.protocol.NewReader(f.rbuf))
}

func (f *filePages) ReadPage() (Page, error) {
	if f.chunk == nil {
		return nil, io.EOF
	}

	header := getPageHeader()
	defer putPageHeader(header)

	for {
		if err := f.decoder.Decode(header); err != nil {
			return nil, err
		}
		data, err := f.readPage(header, f.rbuf)
		if err != nil {
			return nil, err
		}

		var page Page
		switch header.Type {
		case format.DataPageV2:
			page, err = f.readDataPageV2(header, data)
		case format.DataPage:
			page, err = f.readDataPageV1(header, data)
		case format.DictionaryPage:
			// Sometimes parquet files do not have the dictionary page offset
			// recorded in the column metadata. We account for this by lazily
			// reading dictionary pages when we encounter them.
			err = f.readDictionaryPage(header, data)
		default:
			err = fmt.Errorf("cannot read values of type %s from page", header.Type)
		}

		data.unref()

		if err != nil {
			return nil, fmt.Errorf("decoding page %d of column %q: %w", f.index, f.columnPath(), err)
		}

		if page == nil {
			continue
		}

		f.index++
		if f.skip == 0 {
			return page, nil
		}

		// TODO: what about pages that don't embed the number of rows?
		// (data page v1 with no offset index in the column chunk).
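		//
		// Pages wholly inside the skipped range are released and the count of
		// rows left to skip shrinks; the page containing the target row is
		// sliced so only the rows at and after f.skip are returned.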
		numRows := page.NumRows()

		if numRows <= f.skip {
			Release(page)
		} else {
			tail := page.Slice(f.skip, numRows)
			Release(page)
			f.skip = 0
			return tail, nil
		}

		f.skip -= numRows
	}
}

func (f *filePages) readDictionary() error {
	chunk := io.NewSectionReader(f.chunk.file, f.baseOffset, f.chunk.chunk.MetaData.TotalCompressedSize)
	rbuf, pool := getBufioReader(chunk, f.bufferSize)
	defer putBufioReader(rbuf, pool)

	decoder := thrift.NewDecoder(f.protocol.NewReader(rbuf))

	header := getPageHeader()
	defer putPageHeader(header)

	if err := decoder.Decode(header); err != nil {
		return err
	}

	page := buffers.get(int(header.CompressedPageSize))
	defer page.unref()

	if _, err := io.ReadFull(rbuf, page.data); err != nil {
		return err
	}

	return f.readDictionaryPage(header, page)
}

func (f *filePages) readDictionaryPage(header *format.PageHeader, page *buffer) error {
	if header.DictionaryPageHeader == nil {
		return ErrMissingPageHeader
	}
	d, err := f.chunk.column.decodeDictionary(DictionaryPageHeader{header.DictionaryPageHeader}, page, header.UncompressedPageSize)
	if err != nil {
		return err
	}
	f.dictionary = d
	return nil
}

func (f *filePages) readDataPageV1(header *format.PageHeader, page *buffer) (Page, error) {
	if header.DataPageHeader == nil {
		return nil, ErrMissingPageHeader
	}
	if isDictionaryFormat(header.DataPageHeader.Encoding) && f.dictionary == nil {
		if err := f.readDictionary(); err != nil {
			return nil, err
		}
	}
	return f.chunk.column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, page, f.dictionary, header.UncompressedPageSize)
}

func (f *filePages) readDataPageV2(header *format.PageHeader, page *buffer) (Page, error) {
	if header.DataPageHeaderV2 == nil {
		return nil, ErrMissingPageHeader
	}
	if isDictionaryFormat(header.DataPageHeaderV2.Encoding) && f.dictionary == nil {
		// If the program seeked to a row past the first page, the dictionary
		// page may not have been seen, in which case we have to lazily load it
		// from the beginning of the column chunk.
		if err := f.readDictionary(); err != nil {
			return nil, err
		}
	}
	return f.chunk.column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, page, f.dictionary, header.UncompressedPageSize)
}

func (f *filePages) readPage(header *format.PageHeader, reader *bufio.Reader) (*buffer, error) {
	page := buffers.get(int(header.CompressedPageSize))
	defer page.unref()

	if _, err := io.ReadFull(reader, page.data); err != nil {
		return nil, err
	}

	if header.CRC != 0 {
		headerChecksum := uint32(header.CRC)
		bufferChecksum := crc32.ChecksumIEEE(page.data)

		if headerChecksum != bufferChecksum {
			// The parquet specs indicate that corruption errors could be
			// handled gracefully by skipping pages, though this may not
			// always be practical. Depending on how the pages are consumed,
			// missing rows may cause unpredictable behaviors in algorithms.
			//
			// For now, we assume these errors to be fatal, but we may
			// revisit later and improve error handling to be more resilient
			// to data corruption.
			return nil, fmt.Errorf("crc32 checksum mismatch in page of column %q: want=0x%08X got=0x%08X: %w",
				f.columnPath(),
				headerChecksum,
				bufferChecksum,
				ErrCorrupted,
			)
		}
	}

	page.ref()
	return page, nil
}
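
// SeekToRow positions f so the next call to ReadPage returns the page that
// contains rowIndex, with f.skip set to the number of rows ReadPage must
// still discard within that page. Without an offset index the reader rewinds
// to the first data page and skips forward; with one, it binary-searches the
// recorded page locations.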
func (f *filePages) SeekToRow(rowIndex int64) (err error) {
	if f.chunk == nil {
		return io.ErrClosedPipe
	}
	if f.chunk.offsetIndex == nil {
		_, err = f.section.Seek(f.dataOffset-f.baseOffset, io.SeekStart)
		f.skip = rowIndex
		f.index = 0
		if f.dictOffset > 0 {
			f.index = 1
		}
	} else {
		pages := f.chunk.offsetIndex.PageLocations
		index := sort.Search(len(pages), func(i int) bool {
			return pages[i].FirstRowIndex > rowIndex
		}) - 1
		if index < 0 {
			return ErrSeekOutOfRange
		}
		_, err = f.section.Seek(pages[index].Offset-f.baseOffset, io.SeekStart)
		f.skip = rowIndex - pages[index].FirstRowIndex
		f.index = index
	}
	f.rbuf.Reset(&f.section)
	return err
}

func (f *filePages) Close() error {
	putBufioReader(f.rbuf, f.rbufpool)
	f.chunk = nil
	f.section = io.SectionReader{}
	f.rbuf = nil
	f.rbufpool = nil
	f.baseOffset = 0
	f.dataOffset = 0
	f.dictOffset = 0
	f.index = 0
	f.skip = 0
	f.dictionary = nil
	return nil
}

func (f *filePages) columnPath() columnPath {
	return columnPath(f.chunk.column.Path())
}

type putBufioReaderFunc func()

var (
	bufioReaderPoolLock sync.Mutex
	bufioReaderPool     = map[int]*sync.Pool{}
)

func getBufioReader(r io.Reader, bufferSize int) (*bufio.Reader, *sync.Pool) {
	pool := getBufioReaderPool(bufferSize)
	rbuf, _ := pool.Get().(*bufio.Reader)
	if rbuf == nil {
		rbuf = bufio.NewReaderSize(r, bufferSize)
	} else {
		rbuf.Reset(r)
	}
	return rbuf, pool
}

func putBufioReader(rbuf *bufio.Reader, pool *sync.Pool) {
	if rbuf != nil && pool != nil {
		rbuf.Reset(nil)
		pool.Put(rbuf)
	}
}

func getBufioReaderPool(size int) *sync.Pool {
	bufioReaderPoolLock.Lock()
	defer bufioReaderPoolLock.Unlock()

	if pool := bufioReaderPool[size]; pool != nil {
		return pool
	}

	pool := &sync.Pool{}
	bufioReaderPool[size] = pool
	return pool
}

var pageHeaderPool = &sync.Pool{}

func getPageHeader() *format.PageHeader {
	h, _ := pageHeaderPool.Get().(*format.PageHeader)
	if h != nil {
		return h
	}
	return new(format.PageHeader)
}

func putPageHeader(h *format.PageHeader) {
	if h != nil {
		h.CRC = 0
		pageHeaderPool.Put(h)
	}
}
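
// Illustrative sketch of how callers typically drive the types in this file
// (hypothetical file name, error handling elided; ReadPage returns io.EOF at
// the end of each column chunk):
//
//	r, _ := os.Open("rows.parquet")
//	stat, _ := r.Stat()
//	f, _ := parquet.OpenFile(r, stat.Size())
//
//	for _, rowGroup := range f.RowGroups() {
//		for _, chunk := range rowGroup.ColumnChunks() {
//			pages := chunk.Pages()
//			for {
//				page, err := pages.ReadPage()
//				if err != nil {
//					break
//				}
//				_ = page // consume the page values
//			}
//			pages.Close()
//		}
//	}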