github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/row_group.go

package parquet

import (
	"fmt"
	"io"
)

// RowGroup is an interface representing a parquet row group. From the Parquet
// docs, a RowGroup is "a logical horizontal partitioning of the data into rows.
// There is no physical structure that is guaranteed for a row group. A row
// group consists of a column chunk for each column in the dataset."
//
// https://github.com/apache/parquet-format#glossary
type RowGroup interface {
	// Returns the number of rows in the group.
	NumRows() int64

	// Returns the list of column chunks in this row group. The chunks are
	// returned in the order of the leaf columns from the row group's schema.
	//
	// If the underlying implementation is not read-only, the returned
	// parquet.ColumnChunk may implement other interfaces: for example,
	// parquet.ColumnBuffer if the chunk is backed by an in-memory buffer,
	// or typed writer interfaces like parquet.Int32Writer depending on the
	// underlying type of values that can be written to the chunk.
	//
	// As an optimization, the row group may return the same slice across
	// multiple calls to this method. Applications should treat the returned
	// slice as read-only.
	ColumnChunks() []ColumnChunk

	// Returns the schema of rows in the group.
	Schema() *Schema

	// Returns the list of sorting columns describing how rows are sorted in the
	// group.
	//
	// The method will return an empty slice if the rows are not sorted.
	SortingColumns() []SortingColumn

	// Returns a reader exposing the rows of the row group.
	//
	// As an optimization, the returned parquet.Rows object may implement
	// parquet.RowWriterTo and test whether the RowWriter it receives implements
	// the parquet.RowGroupWriter interface.
	//
	// This optimization mechanism is leveraged by the parquet.CopyRows function
	// to skip the generic row-by-row copy algorithm and delegate the copy logic
	// to the parquet.Rows object.
	Rows() Rows
}
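
// As an illustration only (not part of this file's API surface), a row group
// obtained from a file or buffer can be inspected through this interface; the
// sketch below assumes rowGroup is any value implementing RowGroup:
//
//	fmt.Println("rows:", rowGroup.NumRows())
//	for _, chunk := range rowGroup.ColumnChunks() {
//		fmt.Println("column", chunk.Column(), "holds", chunk.NumValues(), "values")
//	}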

// Rows is an interface implemented by row readers returned by calling the Rows
// method of RowGroup instances.
//
// Applications should call Close when they are done using a Rows instance in
// order to release the underlying resources held by the row sequence.
//
// After calling Close, all attempts to read more rows will return io.EOF.
type Rows interface {
	RowReaderWithSchema
	RowSeeker
	io.Closer
}
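
// A typical read loop over a Rows instance might look like the minimal sketch
// below (error handling abbreviated, buffer size arbitrary); the rows returned
// by ReadRows are consumed before the error is inspected:
//
//	rows := rowGroup.Rows()
//	defer rows.Close()
//	buf := make([]parquet.Row, 64)
//	for {
//		n, err := rows.ReadRows(buf)
//		for _, row := range buf[:n] {
//			_ = row // consume the row
//		}
//		if err != nil {
//			break // io.EOF marks the end of the row group
//		}
//	}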

// RowGroupReader is an interface implemented by types that expose sequences of
// row groups to the application.
type RowGroupReader interface {
	ReadRowGroup() (RowGroup, error)
}

// RowGroupWriter is an interface implemented by types that allow the program
// to write row groups.
type RowGroupWriter interface {
	WriteRowGroup(RowGroup) (int64, error)
}
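
// When the source Rows implements parquet.RowWriterTo, parquet.CopyRows may
// delegate the copy to a single WriteRowGroup call on destinations that
// implement this interface (see the documentation of RowGroup.Rows above).
// A minimal sketch, assuming w is a RowWriter (for example a *parquet.Writer)
// and rowGroup is a RowGroup:
//
//	n, err := parquet.CopyRows(w, rowGroup.Rows())
//	if err != nil {
//		return err
//	}
//	fmt.Println("copied", n, "rows")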

// SortingColumn represents a column by which a row group is sorted.
type SortingColumn interface {
	// Returns the path of the column in the row group schema, omitting the name
	// of the root node.
	Path() []string

	// Returns true if the column will sort values in descending order.
	Descending() bool

	// Returns true if the column will put null values at the beginning.
	NullsFirst() bool
}

// Ascending constructs a SortingColumn value which instructs the row group to
// sort the column at the given path in ascending order.
func Ascending(path ...string) SortingColumn { return ascending(path) }

// Descending constructs a SortingColumn value which instructs the row group to
// sort the column at the given path in descending order.
func Descending(path ...string) SortingColumn { return descending(path) }

// NullsFirst wraps the SortingColumn passed as argument so that it instructs
// the row group to place null values first in the column.
func NullsFirst(sortingColumn SortingColumn) SortingColumn { return nullsFirst{sortingColumn} }
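
// For example, a row group sorted primarily by a descending "timestamp" column
// and secondarily by an ascending "name" column with nulls first would be
// described by (the column names are hypothetical):
//
//	sorting := []parquet.SortingColumn{
//		parquet.Descending("timestamp"),
//		parquet.NullsFirst(parquet.Ascending("name")),
//	}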

type ascending []string

func (asc ascending) String() string   { return fmt.Sprintf("ascending(%s)", columnPath(asc)) }
func (asc ascending) Path() []string   { return asc }
func (asc ascending) Descending() bool { return false }
func (asc ascending) NullsFirst() bool { return false }

type descending []string

func (desc descending) String() string   { return fmt.Sprintf("descending(%s)", columnPath(desc)) }
func (desc descending) Path() []string   { return desc }
func (desc descending) Descending() bool { return true }
func (desc descending) NullsFirst() bool { return false }

type nullsFirst struct{ SortingColumn }

func (nf nullsFirst) String() string   { return fmt.Sprintf("nulls_first+%s", nf.SortingColumn) }
func (nf nullsFirst) NullsFirst() bool { return true }

func searchSortingColumn(sortingColumns []SortingColumn, path columnPath) int {
	// There are usually a few sorting columns in a row group, so the linear
	// scan is the fastest option and works whether the sorting column list
	// is sorted or not. Please revisit this decision if this code path ends
	// up being more costly than necessary.
	for i, sorting := range sortingColumns {
		if path.equal(sorting.Path()) {
			return i
		}
	}
	return len(sortingColumns)
}

func sortingColumnsHavePrefix(sortingColumns, prefix []SortingColumn) bool {
	if len(sortingColumns) < len(prefix) {
		return false
	}
	for i, sortingColumn := range prefix {
		if !sortingColumnsAreEqual(sortingColumns[i], sortingColumn) {
			return false
		}
	}
	return true
}

func sortingColumnsAreEqual(s1, s2 SortingColumn) bool {
	path1 := columnPath(s1.Path())
	path2 := columnPath(s2.Path())
	return path1.equal(path2) && s1.Descending() == s2.Descending() && s1.NullsFirst() == s2.NullsFirst()
}

// MergeRowGroups constructs a row group which is a merged view of rowGroups. If
// rowGroups are sorted and the passed options include sorting, the merged row
// group will also be sorted.
//
// The function validates the input to ensure that the merge operation is
// possible, checking that the schemas match or can be converted to an
// optionally configured target schema passed as argument in the option list.
//
// The sorting columns of each row group are also consulted to determine whether
// the output can be represented. If sorting columns are configured on the merge,
// they must be a prefix of the sorting columns of all row groups being merged.
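//
// A minimal sketch, assuming groupA and groupB are RowGroup values sharing the
// same schema; with no sorting options the merged view simply concatenates the
// rows of both groups:
//
//	merged, err := parquet.MergeRowGroups([]parquet.RowGroup{groupA, groupB})
//	if err != nil {
//		// handle the error
//	}
//	rows := merged.Rows()
//	defer rows.Close()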
func MergeRowGroups(rowGroups []RowGroup, options ...RowGroupOption) (RowGroup, error) {
	config, err := NewRowGroupConfig(options...)
	if err != nil {
		return nil, err
	}

	schema := config.Schema
	if len(rowGroups) == 0 {
		return newEmptyRowGroup(schema), nil
	}
	if schema == nil {
		schema = rowGroups[0].Schema()

		for _, rowGroup := range rowGroups[1:] {
			if !nodesAreEqual(schema, rowGroup.Schema()) {
				return nil, ErrRowGroupSchemaMismatch
			}
		}
	}

	mergedRowGroups := make([]RowGroup, len(rowGroups))
	copy(mergedRowGroups, rowGroups)

	for i, rowGroup := range mergedRowGroups {
		if rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) {
			conv, err := Convert(schema, rowGroupSchema)
			if err != nil {
				return nil, fmt.Errorf("cannot merge row groups: %w", err)
			}
			mergedRowGroups[i] = ConvertRowGroup(rowGroup, conv)
		}
	}

	m := &mergedRowGroup{sorting: config.SortingColumns}
	m.init(schema, mergedRowGroups)

	if len(m.sorting) == 0 {
		// When the row group has no ordering, use a simpler version of the
		// merger which simply concatenates rows from each of the row groups.
		// This is preferable because it makes the output deterministic; the
		// heap merge may otherwise reorder rows across groups.
		return &m.multiRowGroup, nil
	}

	for _, rowGroup := range m.rowGroups {
		if !sortingColumnsHavePrefix(rowGroup.SortingColumns(), m.sorting) {
			return nil, ErrRowGroupSortingColumnsMismatch
		}
	}

	m.sortFuncs = make([]columnSortFunc, len(m.sorting))
	forEachLeafColumnOf(schema, func(leaf leafColumn) {
		if sortingIndex := searchSortingColumn(m.sorting, leaf.path); sortingIndex < len(m.sorting) {
			m.sortFuncs[sortingIndex] = columnSortFunc{
				columnIndex: leaf.columnIndex,
				compare: sortFuncOf(
					leaf.node.Type(),
					&SortConfig{
						MaxRepetitionLevel: int(leaf.maxRepetitionLevel),
						MaxDefinitionLevel: int(leaf.maxDefinitionLevel),
						Descending:         m.sorting[sortingIndex].Descending(),
						NullsFirst:         m.sorting[sortingIndex].NullsFirst(),
					},
				),
			}
		}
	})

	return m, nil
}

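// rowGroup is a simple implementation of the RowGroup interface which carries
// a schema, a row count, a list of column chunks, and the sorting columns of
// the group.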
type rowGroup struct {
	schema  *Schema
	numRows int64
	columns []ColumnChunk
	sorting []SortingColumn
}

func (r *rowGroup) NumRows() int64                  { return r.numRows }
func (r *rowGroup) ColumnChunks() []ColumnChunk     { return r.columns }
func (r *rowGroup) SortingColumns() []SortingColumn { return r.sorting }
func (r *rowGroup) Schema() *Schema                 { return r.schema }
func (r *rowGroup) Rows() Rows                      { return &rowGroupRows{rowGroup: r} }

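// NewRowGroupRowReader constructs a Rows instance which reads rows from the
// given row group.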
func NewRowGroupRowReader(rowGroup RowGroup) Rows {
	return &rowGroupRows{rowGroup: rowGroup}
}

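// rowGroupRows is the Rows implementation used for row groups; it lazily
// creates one columnChunkReader per column chunk the first time rows are read.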
type rowGroupRows struct {
	rowGroup RowGroup
	columns  []columnChunkReader
	seek     int64
	inited   bool
	closed   bool
}

func (r *rowGroupRows) init() {
	const columnBufferSize = defaultValueBufferSize
	columns := r.rowGroup.ColumnChunks()
	buffer := make([]Value, columnBufferSize*len(columns))
	r.columns = make([]columnChunkReader, len(columns))

	for i, column := range columns {
		r.columns[i].buffer = buffer[:0:columnBufferSize]
		r.columns[i].reader = column.Pages()
		buffer = buffer[columnBufferSize:]
	}

	r.inited = true
}

func (r *rowGroupRows) Reset() {
	for i := range r.columns {
		// Ignore errors because we are resetting the reader; if the error
		// persists we will see it on the next read, and otherwise we can
		// read back from the beginning.
		r.columns[i].seekToRow(0)
	}
	r.seek = 0
}

func (r *rowGroupRows) Close() error {
	var lastErr error

	for i := range r.columns {
		if err := r.columns[i].close(); err != nil {
			lastErr = err
		}
	}

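	// Mark the reader as initialized so a later call to ReadRows does not
	// re-initialize the column readers; together with the closed flag this
	// makes ReadRows return io.EOF after Close.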
	r.inited = true
	r.closed = true
	return lastErr
}

func (r *rowGroupRows) Schema() *Schema {
	return r.rowGroup.Schema()
}

func (r *rowGroupRows) SeekToRow(rowIndex int64) error {
	if r.closed {
		return io.ErrClosedPipe
	}

	for i := range r.columns {
		if err := r.columns[i].seekToRow(rowIndex); err != nil {
			return err
		}
	}

	r.seek = rowIndex
	return nil
}

func (r *rowGroupRows) ReadRows(rows []Row) (int, error) {
	if !r.inited {
		r.init()
		if r.seek > 0 {
			if err := r.SeekToRow(r.seek); err != nil {
				return 0, err
			}
		}
	}

	if r.closed {
		return 0, io.EOF
	}

	for i := range rows {
		rows[i] = rows[i][:0]
	}

	return r.rowGroup.Schema().readRows(rows, 0, r.columns)
}

/*
func (r *rowGroupRows) WriteRowsTo(w RowWriter) (int64, error) {
	if r.rowGroup == nil {
		return CopyRows(w, struct{ RowReaderWithSchema }{r})
	}
	defer func() { r.rowGroup, r.seek = nil, 0 }()
	rowGroup := r.rowGroup
	if r.seek > 0 {
		columns := rowGroup.ColumnChunks()
		seekRowGroup := &seekRowGroup{
			base:    rowGroup,
			seek:    r.seek,
			columns: make([]ColumnChunk, len(columns)),
		}
		seekColumnChunks := make([]seekColumnChunk, len(columns))
		for i := range seekColumnChunks {
			seekColumnChunks[i].base = columns[i]
			seekColumnChunks[i].seek = r.seek
			seekRowGroup.columns[i] = &seekColumnChunks[i]
		}
		rowGroup = seekRowGroup
	}

	switch dst := w.(type) {
	case RowGroupWriter:
		return dst.WriteRowGroup(rowGroup)

	case PageWriter:
		for _, column := range rowGroup.ColumnChunks() {
			_, err := copyPagesAndClose(dst, column.Pages())
			if err != nil {
				return 0, err
			}
		}
		return rowGroup.NumRows(), nil
	}

	return CopyRows(w, struct{ RowReaderWithSchema }{r})
}

func (r *rowGroupRows) writeRowsTo(w pageAndValueWriter, limit int64) (numRows int64, err error) {
	for i := range r.columns {
		n, err := r.columns[i].writeRowsTo(w, limit)
		if err != nil {
			return numRows, err
		}
		if i == 0 {
			numRows = n
		} else if numRows != n {
			return numRows, fmt.Errorf("column %d wrote %d rows but the previous column(s) wrote %d rows", i, n, numRows)
		}
	}
	return numRows, nil
}
*/

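// seekRowGroup wraps a RowGroup to expose a view of it which skips the first
// seek rows.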
type seekRowGroup struct {
	base    RowGroup
	seek    int64
	columns []ColumnChunk
}

func (g *seekRowGroup) NumRows() int64 {
	return g.base.NumRows() - g.seek
}

func (g *seekRowGroup) ColumnChunks() []ColumnChunk {
	return g.columns
}

func (g *seekRowGroup) Schema() *Schema {
	return g.base.Schema()
}

func (g *seekRowGroup) SortingColumns() []SortingColumn {
	return g.base.SortingColumns()
}

func (g *seekRowGroup) Rows() Rows {
	rows := g.base.Rows()
	rows.SeekToRow(g.seek)
	return rows
}

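// seekColumnChunk wraps a ColumnChunk so that the pages it exposes are
// positioned at the configured seek row.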
type seekColumnChunk struct {
	base ColumnChunk
	seek int64
}

func (c *seekColumnChunk) Type() Type {
	return c.base.Type()
}

func (c *seekColumnChunk) Column() int {
	return c.base.Column()
}

func (c *seekColumnChunk) Pages() Pages {
	pages := c.base.Pages()
	pages.SeekToRow(c.seek)
	return pages
}

func (c *seekColumnChunk) ColumnIndex() ColumnIndex {
	return c.base.ColumnIndex()
}

func (c *seekColumnChunk) OffsetIndex() OffsetIndex {
	return c.base.OffsetIndex()
}

func (c *seekColumnChunk) BloomFilter() BloomFilter {
	return c.base.BloomFilter()
}

func (c *seekColumnChunk) NumValues() int64 {
	return c.base.NumValues()
}

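// emptyRowGroup is a RowGroup implementation which carries a schema but
// contains no rows; it is notably returned by MergeRowGroups when the list of
// row groups to merge is empty.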
type emptyRowGroup struct {
	schema  *Schema
	columns []ColumnChunk
}

func newEmptyRowGroup(schema *Schema) *emptyRowGroup {
	columns := schema.Columns()
	rowGroup := &emptyRowGroup{
		schema:  schema,
		columns: make([]ColumnChunk, len(columns)),
	}
	emptyColumnChunks := make([]emptyColumnChunk, len(columns))
	for i, column := range schema.Columns() {
		leaf, _ := schema.Lookup(column...)
		emptyColumnChunks[i].typ = leaf.Node.Type()
		emptyColumnChunks[i].column = int16(leaf.ColumnIndex)
		rowGroup.columns[i] = &emptyColumnChunks[i]
	}
	return rowGroup
}

func (g *emptyRowGroup) NumRows() int64                  { return 0 }
func (g *emptyRowGroup) ColumnChunks() []ColumnChunk     { return g.columns }
func (g *emptyRowGroup) Schema() *Schema                 { return g.schema }
func (g *emptyRowGroup) SortingColumns() []SortingColumn { return nil }
func (g *emptyRowGroup) Rows() Rows                      { return emptyRows{g.schema} }

type emptyColumnChunk struct {
	typ    Type
	column int16
}

func (c *emptyColumnChunk) Type() Type               { return c.typ }
func (c *emptyColumnChunk) Column() int              { return int(c.column) }
func (c *emptyColumnChunk) Pages() Pages             { return emptyPages{} }
func (c *emptyColumnChunk) ColumnIndex() ColumnIndex { return emptyColumnIndex{} }
func (c *emptyColumnChunk) OffsetIndex() OffsetIndex { return emptyOffsetIndex{} }
func (c *emptyColumnChunk) BloomFilter() BloomFilter { return emptyBloomFilter{} }
func (c *emptyColumnChunk) NumValues() int64         { return 0 }

type emptyBloomFilter struct{}

func (emptyBloomFilter) ReadAt([]byte, int64) (int, error) { return 0, io.EOF }
func (emptyBloomFilter) Size() int64                       { return 0 }
func (emptyBloomFilter) Check(Value) (bool, error)         { return false, nil }

type emptyRows struct{ schema *Schema }

func (r emptyRows) Close() error                         { return nil }
func (r emptyRows) Schema() *Schema                      { return r.schema }
func (r emptyRows) ReadRows([]Row) (int, error)          { return 0, io.EOF }
func (r emptyRows) SeekToRow(int64) error                { return nil }
func (r emptyRows) WriteRowsTo(RowWriter) (int64, error) { return 0, nil }

type emptyPages struct{}

func (emptyPages) ReadPage() (Page, error) { return nil, io.EOF }
func (emptyPages) SeekToRow(int64) error   { return nil }
func (emptyPages) Close() error            { return nil }

var (
	_ RowReaderWithSchema = (*rowGroupRows)(nil)
	//_ RowWriterTo         = (*rowGroupRows)(nil)

	_ RowReaderWithSchema = emptyRows{}
	_ RowWriterTo         = emptyRows{}
)