package parquet

import (
	"fmt"
	"io"

	"github.com/segmentio/parquet-go/internal/debug"
)

// RowGroup is an interface representing a parquet row group. From the Parquet
// docs, a RowGroup is "a logical horizontal partitioning of the data into rows.
// There is no physical structure that is guaranteed for a row group. A row
// group consists of a column chunk for each column in the dataset."
//
// https://github.com/apache/parquet-format#glossary
type RowGroup interface {
	// Returns the number of rows in the group.
	NumRows() int64

	// Returns the list of column chunks in this row group. The chunks are
	// ordered in the order of leaf columns from the row group's schema.
	//
	// If the underlying implementation is not read-only, the returned
	// parquet.ColumnChunk may implement other interfaces: for example,
	// parquet.ColumnBuffer if the chunk is backed by an in-memory buffer,
	// or typed writer interfaces like parquet.Int32Writer depending on the
	// underlying type of values that can be written to the chunk.
	//
	// As an optimization, the row group may return the same slice across
	// multiple calls to this method. Applications should treat the returned
	// slice as read-only.
	ColumnChunks() []ColumnChunk

	// Returns the schema of rows in the group.
	Schema() *Schema

	// Returns the list of sorting columns describing how rows are sorted in the
	// group.
	//
	// The method will return an empty slice if the rows are not sorted.
	SortingColumns() []SortingColumn

	// Returns a reader exposing the rows of the row group.
	//
	// As an optimization, the returned parquet.Rows object may implement
	// parquet.RowWriterTo, and test the RowWriter it receives for an
	// implementation of the parquet.RowGroupWriter interface.
	//
	// This optimization mechanism is leveraged by the parquet.CopyRows function
	// to skip the generic row-by-row copy algorithm and delegate the copy logic
	// to the parquet.Rows object.
	Rows() Rows
}

// Rows is an interface implemented by row readers returned by calling the Rows
// method of RowGroup instances.
//
// Applications should call Close when they are done using a Rows instance in
// order to release the underlying resources held by the row sequence.
//
// After calling Close, all attempts to read more rows will return io.EOF.
type Rows interface {
	RowReaderWithSchema
	RowSeeker
	io.Closer
}

// RowGroupReader is an interface implemented by types that expose sequences of
// row groups to the application.
type RowGroupReader interface {
	ReadRowGroup() (RowGroup, error)
}

// RowGroupWriter is an interface implemented by types that allow the program
// to write row groups.
type RowGroupWriter interface {
	WriteRowGroup(RowGroup) (int64, error)
}

// SortingColumn represents a column by which a row group is sorted.
type SortingColumn interface {
	// Returns the path of the column in the row group schema, omitting the name
	// of the root node.
	Path() []string

	// Returns true if the column will sort values in descending order.
	Descending() bool

	// Returns true if the column will put null values at the beginning.
	NullsFirst() bool
}

// Ascending constructs a SortingColumn value which dictates to sort the column
// at the path given as argument in ascending order.
func Ascending(path ...string) SortingColumn { return ascending(path) }

// Descending constructs a SortingColumn value which dictates to sort the column
// at the path given as argument in descending order.
func Descending(path ...string) SortingColumn { return descending(path) }

// NullsFirst wraps the SortingColumn passed as argument so that it instructs
// the row group to place null values first in the column.
func NullsFirst(sortingColumn SortingColumn) SortingColumn { return nullsFirst{sortingColumn} }

// ascending is the SortingColumn implementation returned by Ascending; the
// value is the column path itself.
type ascending []string

func (asc ascending) String() string   { return fmt.Sprintf("ascending(%s)", columnPath(asc)) }
func (asc ascending) Path() []string   { return asc }
func (asc ascending) Descending() bool { return false }
func (asc ascending) NullsFirst() bool { return false }

// descending is the SortingColumn implementation returned by Descending; the
// value is the column path itself.
type descending []string

func (desc descending) String() string   { return fmt.Sprintf("descending(%s)", columnPath(desc)) }
func (desc descending) Path() []string   { return desc }
func (desc descending) Descending() bool { return true }
func (desc descending) NullsFirst() bool { return false }

// nullsFirst wraps another SortingColumn and overrides only its NullsFirst
// method to return true.
type nullsFirst struct{ SortingColumn }

func (nf nullsFirst) String() string   { return fmt.Sprintf("nulls_first+%s", nf.SortingColumn) }
func (nf nullsFirst) NullsFirst() bool { return true }

// searchSortingColumn returns the index in sortingColumns of the column whose
// path equals the path given as argument, or len(sortingColumns) if no such
// column exists.
func searchSortingColumn(sortingColumns []SortingColumn, path columnPath) int {
	// There are usually a few sorting columns in a row group, so the linear
	// scan is the fastest option and works whether the sorting column list
	// is sorted or not. Please revisit this decision if this code path ends
	// up being more costly than necessary.
	for i, sorting := range sortingColumns {
		if path.equal(sorting.Path()) {
			return i
		}
	}
	return len(sortingColumns)
}

// sortingColumnsHavePrefix returns true if prefix is a prefix of
// sortingColumns, comparing entries with sortingColumnsAreEqual.
func sortingColumnsHavePrefix(sortingColumns, prefix []SortingColumn) bool {
	if len(sortingColumns) < len(prefix) {
		return false
	}
	for i, sortingColumn := range prefix {
		if !sortingColumnsAreEqual(sortingColumns[i], sortingColumn) {
			return false
		}
	}
	return true
}

// sortingColumnsAreEqual returns true if the two sorting columns have the same
// path, sort direction, and null ordering.
func sortingColumnsAreEqual(s1, s2 SortingColumn) bool {
	path1 := columnPath(s1.Path())
	path2 := columnPath(s2.Path())
	return path1.equal(path2) && s1.Descending() == s2.Descending() && s1.NullsFirst() == s2.NullsFirst()
}

// rowGroup is a plain in-memory RowGroup implementation holding its schema,
// row count, column chunks, and sorting columns as fields.
type rowGroup struct {
	schema  *Schema
	numRows int64
	columns []ColumnChunk
	sorting []SortingColumn
}

func (r *rowGroup) NumRows() int64                  { return r.numRows }
func (r *rowGroup) ColumnChunks() []ColumnChunk     { return r.columns }
func (r *rowGroup) SortingColumns() []SortingColumn { return r.sorting }
func (r *rowGroup) Schema() *Schema                 { return r.schema }
func (r *rowGroup) Rows() Rows                      { return newRowGroupRows(r, ReadModeSync) }

// NewRowGroupRowReader constructs a Rows instance exposing the rows of the
// given row group, reading pages synchronously.
func NewRowGroupRowReader(rowGroup RowGroup) Rows {
	return newRowGroupRows(rowGroup, ReadModeSync)
}

// rowGroupRows is the Rows implementation used to read rows of a RowGroup; it
// assembles rows from values read page-by-page from each leaf column.
type rowGroupRows struct {
	rowGroup     RowGroup
	buffers      []Value           // backing array partitioned into one value buffer per column (see buffer)
	readers      []Pages           // one page reader per leaf column
	columns      []columnChunkRows // per-column read state
	inited       bool              // set once init has run; init is invoked lazily by SeekToRow/ReadRows
	closed       bool              // set by Close; reads return io.EOF afterwards
	done         chan<- struct{}   // closed by Close to stop asyncPages goroutines (ReadModeAsync only)
	pageReadMode ReadMode
}

// columnChunkRows tracks the read position within the current page of a
// single column.
type columnChunkRows struct {
	rows   int64       // number of rows remaining in the current page
	offset int32       // read offset into the buffered values
	length int32       // number of values currently buffered
	page   Page        // current page; released before loading the next one
	values ValueReader // value reader of the current page
}

const columnBufferSize = defaultValueBufferSize

// buffer returns the sub-slice of r.buffers reserved for column i. The
// three-index slice caps capacity so that appends cannot spill into the next
// column's buffer.
func (r *rowGroupRows) buffer(i int) []Value {
	j := (i + 0) * columnBufferSize
	k := (i + 1) * columnBufferSize
	return r.buffers[j:k:k]
}

// newRowGroupRows constructs a rowGroupRows; buffers and page readers are
// allocated lazily by init on the first read or seek.
func newRowGroupRows(rowGroup RowGroup, pageReadMode ReadMode) *rowGroupRows {
	return &rowGroupRows{
		rowGroup:     rowGroup,
		pageReadMode: pageReadMode,
	}
}

// init allocates the per-column value buffers, page readers, and read state,
// honoring the configured page read mode.
func (r *rowGroupRows) init() {
	columns := r.rowGroup.ColumnChunks()

	r.buffers = make([]Value, len(columns)*columnBufferSize)
	r.readers = make([]Pages, len(columns))
	r.columns = make([]columnChunkRows, len(columns))

	switch r.pageReadMode {
	case ReadModeAsync:
		done := make(chan struct{})
		r.done = done
		readers := make([]asyncPages, len(columns))
		for i, column := range columns {
			readers[i].init(column.Pages(), done)
			r.readers[i] = &readers[i]
		}
	case ReadModeSync:
		for i, column := range columns {
			r.readers[i] = column.Pages()
		}
	default:
		panic(fmt.Sprintf("parquet: invalid page read mode: %d", r.pageReadMode))
	}

	r.inited = true
	// This finalizer is used to ensure that the goroutines started by calling
	// init on the underlying page readers will be shutdown in the event that
	// Close isn't called and the rowGroupRows object is garbage collected.
	debug.SetFinalizer(r, func(r *rowGroupRows) { r.Close() })
}

// clear releases the pages currently held, then zeroes the per-column state
// and the value buffers so no parquet values remain referenced.
func (r *rowGroupRows) clear() {
	for i := range r.columns {
		Release(r.columns[i].page)
	}

	for i := range r.columns {
		r.columns[i] = columnChunkRows{}
	}

	for i := range r.buffers {
		r.buffers[i] = Value{}
	}
}

// Reset repositions every column page reader at the first row and drops any
// buffered pages and values.
func (r *rowGroupRows) Reset() {
	for i := range r.readers {
		// Ignore errors because we are resetting the reader, if the error
		// persists we will see it on the next read, and otherwise we can
		// read back from the beginning.
		r.readers[i].SeekToRow(0)
	}
	r.clear()
}

// Close implements io.Closer: it signals async page readers to stop (if any),
// closes all page readers, and releases buffered pages. It returns the last
// error observed while closing the readers.
func (r *rowGroupRows) Close() error {
	var lastErr error

	if r.done != nil {
		close(r.done)
		r.done = nil
	}

	for i := range r.readers {
		if err := r.readers[i].Close(); err != nil {
			lastErr = err
		}
	}

	r.clear()
	// Mark the reader both initialized and closed: closed makes SeekToRow and
	// ReadRows bail out early, and leaving inited set prevents those paths
	// from ever re-running init on a closed reader.
	r.inited = true
	r.closed = true
	return lastErr
}

// SeekToRow positions all column readers at the given row index relative to
// the beginning of the row group, discarding any buffered state.
func (r *rowGroupRows) SeekToRow(rowIndex int64) error {
	var lastErr error

	if r.closed {
		return io.ErrClosedPipe
	}

	if !r.inited {
		r.init()
	}

	for i := range r.readers {
		if err := r.readers[i].SeekToRow(rowIndex); err != nil {
			lastErr = err
		}
	}

	r.clear()
	return lastErr
}

// ReadRows implements RowReader, reading up to len(rows) rows from the row
// group and returning io.EOF once all rows have been consumed.
func (r *rowGroupRows) ReadRows(rows []Row) (int, error) {
	if r.closed {
		return 0, io.EOF
	}

	if !r.inited {
		r.init()
	}

	// Limit the number of rows that we read to the smallest number of rows
	// remaining in the current page of each column. This is necessary because
	// the pointers exposed to the returned rows need to remain valid until the
	// next call to ReadRows, SeekToRow, Reset, or Close. If we release one of
	// the columns' page, the rows that were already read during the ReadRows
	// call would be invalidated, and might reference memory locations that have
	// been reused due to pooling of page buffers.
	numRows := int64(len(rows))

	for i := range r.columns {
		c := &r.columns[i]
		// When all rows of the current page of a column have been consumed we
		// have to read the next page. This will effectively invalidate all
		// pointers of values previously held in the page, which is valid if
		// the application respects the RowReader interface and does not retain
		// parquet values without cloning them first.
		for c.rows == 0 {
			var err error
			clearValues(r.buffer(i))

			c.offset = 0
			c.length = 0
			c.values = nil
			Release(c.page)

			c.page, err = r.readers[i].ReadPage()
			if err != nil {
				if err != io.EOF {
					return 0, err
				}
				break
			}

			c.rows = c.page.NumRows()
			c.values = c.page.Values()
		}

		if c.rows < numRows {
			numRows = c.rows
		}
	}

	// Truncate the output rows before appending values so stale values from a
	// previous call do not leak into the result.
	for i := range rows {
		rows[i] = rows[i][:0]
	}

	if numRows == 0 {
		return 0, io.EOF
	}

	n, err := r.readRows(rows[:numRows])

	// Account for the rows consumed from the current page of every column.
	for i := range r.columns {
		r.columns[i].rows -= int64(n)
	}

	return n, err
}

// Schema returns the schema of rows read from the row group.
func (r *rowGroupRows) Schema() *Schema {
	return r.rowGroup.Schema()
}

// readRows assembles len(rows) rows by appending, for each row, the values of
// every column. Within a column, a row consists of one leading value followed
// by the run of values whose repetition level is non-zero (values repeated
// within the same row).
func (r *rowGroupRows) readRows(rows []Row) (int, error) {
	for i := range rows {
	readColumns:
		for columnIndex := range r.columns {
			col := &r.columns[columnIndex]
			buf := r.buffer(columnIndex)

			// The first value of a row is consumed unconditionally (skip=1);
			// after a buffer refill mid-row, skip is reset to 0 so that only
			// values with non-zero repetition levels are taken.
			skip := int32(1)
			for {
				if col.offset == col.length {
					// Buffer exhausted: refill from the page's value reader.
					n, err := col.values.ReadValues(buf)
					if n == 0 {
						switch err {
						case nil:
							err = io.ErrNoProgress
						case io.EOF:
							// No more values in this column; move on to the
							// next column of the same row.
							continue readColumns
						}
						return i, err
					}
					col.offset = 0
					col.length = int32(n)
				}

				// Hints for the compiler to elide bounds checks in the scan
				// below.
				_ = buf[:col.offset]
				_ = buf[:col.length]
				endOffset := col.offset + skip

				// Extend the row over buffered values that belong to it.
				for endOffset < col.length && buf[endOffset].repetitionLevel != 0 {
					endOffset++
				}

				rows[i] = append(rows[i], buf[col.offset:endOffset]...)

				// If the scan stopped before the end of the buffer, the next
				// value starts a new row and this column is done; otherwise
				// the buffer ran out mid-row and we loop to refill it.
				if col.offset = endOffset; col.offset < col.length {
					break
				}
				skip = 0
			}
		}
	}
	return len(rows), nil
}

// seekRowGroup is a RowGroup implementation exposing a view of its base row
// group which starts at a given row index.
type seekRowGroup struct {
	base    RowGroup
	seek    int64
	columns []ColumnChunk
}

func (g *seekRowGroup) NumRows() int64 {
	return g.base.NumRows() - g.seek
}

func (g *seekRowGroup) ColumnChunks() []ColumnChunk {
	return g.columns
}

func (g *seekRowGroup) Schema() *Schema {
	return g.base.Schema()
}

func (g *seekRowGroup) SortingColumns() []SortingColumn {
	return g.base.SortingColumns()
}

func (g *seekRowGroup) Rows() Rows {
	rows := g.base.Rows()
	rows.SeekToRow(g.seek)
	return rows
}

// seekColumnChunk wraps a ColumnChunk so that its Pages method returns page
// readers positioned at the seek row index.
type seekColumnChunk struct {
	base ColumnChunk
	seek int64
}

func (c *seekColumnChunk) Type() Type {
	return c.base.Type()
}

func (c *seekColumnChunk) Column() int {
	return c.base.Column()
}

func (c *seekColumnChunk) Pages() Pages {
	pages := c.base.Pages()
	pages.SeekToRow(c.seek)
	return pages
}

func (c *seekColumnChunk) ColumnIndex() ColumnIndex {
	return c.base.ColumnIndex()
}

func (c *seekColumnChunk) OffsetIndex() OffsetIndex {
	return c.base.OffsetIndex()
}

func (c *seekColumnChunk) BloomFilter() BloomFilter {
	return c.base.BloomFilter()
}

func (c *seekColumnChunk) NumValues() int64 {
	return c.base.NumValues()
}

// emptyRowGroup is a RowGroup implementation carrying a schema but zero rows.
type emptyRowGroup struct {
	schema  *Schema
	columns []ColumnChunk
}

// newEmptyRowGroup constructs an empty row group with one empty column chunk
// per leaf column of the schema.
func newEmptyRowGroup(schema *Schema) *emptyRowGroup {
	columns := schema.Columns()
	rowGroup := &emptyRowGroup{
		schema:  schema,
		columns: make([]ColumnChunk, len(columns)),
	}
	// Allocate the chunks in a single slice and take their addresses to avoid
	// one allocation per column.
	emptyColumnChunks := make([]emptyColumnChunk, len(columns))
	for i, column := range schema.Columns() {
		leaf, _ := schema.Lookup(column...)
		emptyColumnChunks[i].typ = leaf.Node.Type()
		emptyColumnChunks[i].column = int16(leaf.ColumnIndex)
		rowGroup.columns[i] = &emptyColumnChunks[i]
	}
	return rowGroup
}

func (g *emptyRowGroup) NumRows() int64                  { return 0 }
func (g *emptyRowGroup) ColumnChunks() []ColumnChunk     { return g.columns }
func (g *emptyRowGroup) Schema() *Schema                 { return g.schema }
func (g *emptyRowGroup) SortingColumns() []SortingColumn { return nil }
func (g *emptyRowGroup) Rows() Rows                      { return emptyRows{g.schema} }

// emptyColumnChunk is a ColumnChunk implementation holding no pages and no
// values, used by emptyRowGroup.
type emptyColumnChunk struct {
	typ    Type
	column int16
}

func (c *emptyColumnChunk) Type() Type               { return c.typ }
func (c *emptyColumnChunk) Column() int              { return int(c.column) }
func (c *emptyColumnChunk) Pages() Pages             { return emptyPages{} }
func (c *emptyColumnChunk) ColumnIndex() ColumnIndex { return emptyColumnIndex{} }
func (c *emptyColumnChunk) OffsetIndex() OffsetIndex { return emptyOffsetIndex{} }
func (c *emptyColumnChunk) BloomFilter() BloomFilter { return emptyBloomFilter{} }
func (c *emptyColumnChunk) NumValues() int64         { return 0 }

// emptyBloomFilter is a BloomFilter implementation with no content; Check
// always reports a non-match.
type emptyBloomFilter struct{}

func (emptyBloomFilter) ReadAt([]byte, int64) (int, error) { return 0, io.EOF }
func (emptyBloomFilter) Size() int64                       { return 0 }
func (emptyBloomFilter) Check(Value) (bool, error)         { return false, nil }

// emptyRows is the Rows implementation returned by empty row groups; reads
// immediately report io.EOF and writes copy nothing.
type emptyRows struct{ schema *Schema }

func (r emptyRows) Close() error                         { return nil }
func (r emptyRows) Schema() *Schema                      { return r.schema }
func (r emptyRows) ReadRows([]Row) (int, error)          { return 0, io.EOF }
func (r emptyRows) SeekToRow(int64) error                { return nil }
func (r emptyRows) WriteRowsTo(RowWriter) (int64, error) { return 0, nil }

// emptyPages is a Pages implementation containing no pages.
type emptyPages struct{}

func (emptyPages) ReadPage() (Page, error) { return nil, io.EOF }
func (emptyPages) SeekToRow(int64) error   { return nil }
func (emptyPages) Close() error            { return nil }
// Compile-time checks that the row reader implementations above satisfy the
// interfaces the rest of the package relies on.
var (
	_ RowReaderWithSchema = (*rowGroupRows)(nil)
	//_ RowWriterTo = (*rowGroupRows)(nil)

	_ RowReaderWithSchema = emptyRows{}
	_ RowWriterTo         = emptyRows{}
)