github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/row_buffer.go

github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/row_buffer.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  	"sort"
     6  
     7  	"github.com/parquet-go/parquet-go/deprecated"
     8  	"github.com/parquet-go/parquet-go/encoding"
     9  )
    10  
    11  // RowBuffer is an implementation of the RowGroup interface which stores parquet
    12  // rows in memory.
    13  //
    14  // Unlike GenericBuffer which uses a column layout to store values in memory
    15  // buffers, RowBuffer uses a row layout. The use of row layout provides greater
    16  // efficiency when sorting the buffer, which is the primary use case for the
    17  // RowBuffer type. Applications which intend to sort rows prior to writing them
    18  // to a parquet file will often see lower CPU utilization from using a RowBuffer
    19  // than a GenericBuffer.
    20  //
    21  // RowBuffer values are not safe to use concurrently from multiple goroutines.
    22  type RowBuffer[T any] struct {
    23  	alloc   rowAllocator
    24  	schema  *Schema
    25  	sorting []SortingColumn
    26  	rows    []Row
    27  	values  []Value
    28  	compare func(Row, Row) int
    29  }
    30  
    31  // NewRowBuffer constructs a new row buffer.
    32  func NewRowBuffer[T any](options ...RowGroupOption) *RowBuffer[T] {
    33  	config := DefaultRowGroupConfig()
    34  	config.Apply(options...)
    35  	if err := config.Validate(); err != nil {
    36  		panic(err)
    37  	}
    38  
    39  	t := typeOf[T]()
    40  	if config.Schema == nil && t != nil {
    41  		config.Schema = schemaOf(dereference(t))
    42  	}
    43  
    44  	if config.Schema == nil {
    45  		panic("row buffer must be instantiated with schema or concrete type.")
    46  	}
    47  
    48  	return &RowBuffer[T]{
    49  		schema:  config.Schema,
    50  		sorting: config.Sorting.SortingColumns,
    51  		compare: config.Schema.Comparator(config.Sorting.SortingColumns...),
    52  	}
    53  }
    54  
    55  // Reset clears the content of the buffer without releasing its memory.
    56  func (buf *RowBuffer[T]) Reset() {
    57  	for i := range buf.rows {
    58  		buf.rows[i] = nil
    59  	}
    60  	for i := range buf.values {
    61  		buf.values[i] = Value{}
    62  	}
    63  	buf.rows = buf.rows[:0]
    64  	buf.values = buf.values[:0]
    65  	buf.alloc.reset()
    66  }
    67  
    68  // NumRows returns the number of rows currently written to the buffer.
    69  func (buf *RowBuffer[T]) NumRows() int64 { return int64(len(buf.rows)) }
    70  
    71  // ColumnChunks returns a view of the buffer's columns.
    72  //
    73  // Note that reading columns of a RowBuffer will be less efficient than reading
    74  // columns of a GenericBuffer since the latter uses a column layout. This method
    75  // is mainly exposed to satisfy the RowGroup interface, applications which need
    76  // compute-efficient column scans on in-memory buffers should likely use a
    77  // GenericBuffer instead.
    78  //
    79  // The returned column chunks are snapshots at the time the method is called,
    80  // they remain valid until the next call to Reset on the buffer.
    81  func (buf *RowBuffer[T]) ColumnChunks() []ColumnChunk {
    82  	columns := buf.schema.Columns()
    83  	chunks := make([]rowBufferColumnChunk, len(columns))
    84  
    85  	for i, column := range columns {
    86  		leafColumn, _ := buf.schema.Lookup(column...)
    87  		chunks[i] = rowBufferColumnChunk{
    88  			page: rowBufferPage{
    89  				rows:               buf.rows,
    90  				typ:                leafColumn.Node.Type(),
    91  				column:             leafColumn.ColumnIndex,
    92  				maxRepetitionLevel: byte(leafColumn.MaxRepetitionLevel),
    93  				maxDefinitionLevel: byte(leafColumn.MaxDefinitionLevel),
    94  			},
    95  		}
    96  	}
    97  
    98  	columnChunks := make([]ColumnChunk, len(chunks))
    99  	for i := range chunks {
   100  		columnChunks[i] = &chunks[i]
   101  	}
   102  	return columnChunks
   103  }
   104  
   105  // SortingColumns returns the list of columns that rows are expected to be
   106  // sorted by.
   107  //
   108  // The list of sorting columns is configured when the buffer is created and used
   109  // when it is sorted.
   110  //
   111  // Note that unless the buffer is explicitly sorted, there are no guarantees
   112  // that the rows it contains will be in the order specified by the sorting
   113  // columns.
   114  func (buf *RowBuffer[T]) SortingColumns() []SortingColumn { return buf.sorting }
   115  
   116  // Schema returns the schema of rows in the buffer.
   117  func (buf *RowBuffer[T]) Schema() *Schema { return buf.schema }
   118  
   119  // Len returns the number of rows in the buffer.
   120  //
   121  // The method contributes to satisfying sort.Interface.
   122  func (buf *RowBuffer[T]) Len() int { return len(buf.rows) }
   123  
   124  // Less compares the rows at index i and j according to the sorting columns
   125  // configured on the buffer.
   126  //
   127  // The method contributes to satisfying sort.Interface.
   128  func (buf *RowBuffer[T]) Less(i, j int) bool {
   129  	return buf.compare(buf.rows[i], buf.rows[j]) < 0
   130  }
   131  
   132  // Swap exchanges the rows at index i and j in the buffer.
   133  //
   134  // The method contributes to satisfying sort.Interface.
   135  func (buf *RowBuffer[T]) Swap(i, j int) {
   136  	buf.rows[i], buf.rows[j] = buf.rows[j], buf.rows[i]
   137  }
   138  
   139  // Rows returns a Rows instance exposing rows stored in the buffer.
   140  //
   141  // The rows returned are a snapshot at the time the method is called.
   142  // The returned rows and values read from it remain valid until the next call
   143  // to Reset on the buffer.
   144  func (buf *RowBuffer[T]) Rows() Rows {
   145  	return &rowBufferRows{rows: buf.rows, schema: buf.schema}
   146  }
   147  
   148  // Write writes rows to the buffer, returning the number of rows written.
   149  func (buf *RowBuffer[T]) Write(rows []T) (int, error) {
   150  	for i := range rows {
   151  		off := len(buf.values)
   152  		buf.values = buf.schema.Deconstruct(buf.values, &rows[i])
   153  		end := len(buf.values)
   154  		row := buf.values[off:end:end]
   155  		buf.alloc.capture(row)
   156  		buf.rows = append(buf.rows, row)
   157  	}
   158  	return len(rows), nil
   159  }
   160  
   161  // WriteRows writes parquet rows to the buffer, returing the number of rows
   162  // written.
   163  func (buf *RowBuffer[T]) WriteRows(rows []Row) (int, error) {
   164  	for i := range rows {
   165  		off := len(buf.values)
   166  		buf.values = append(buf.values, rows[i]...)
   167  		end := len(buf.values)
   168  		row := buf.values[off:end:end]
   169  		buf.alloc.capture(row)
   170  		buf.rows = append(buf.rows, row)
   171  	}
   172  	return len(rows), nil
   173  }
   174  
   175  type rowBufferColumnChunk struct{ page rowBufferPage }
   176  
   177  func (c *rowBufferColumnChunk) Type() Type { return c.page.Type() }
   178  
   179  func (c *rowBufferColumnChunk) Column() int { return c.page.Column() }
   180  
   181  func (c *rowBufferColumnChunk) Pages() Pages { return onePage(&c.page) }
   182  
   183  func (c *rowBufferColumnChunk) ColumnIndex() (ColumnIndex, error) { return nil, nil }
   184  
   185  func (c *rowBufferColumnChunk) OffsetIndex() (OffsetIndex, error) { return nil, nil }
   186  
   187  func (c *rowBufferColumnChunk) BloomFilter() BloomFilter { return nil }
   188  
   189  func (c *rowBufferColumnChunk) NumValues() int64 { return c.page.NumValues() }
   190  
   191  type rowBufferPage struct {
   192  	rows               []Row
   193  	typ                Type
   194  	column             int
   195  	maxRepetitionLevel byte
   196  	maxDefinitionLevel byte
   197  }
   198  
   199  func (p *rowBufferPage) Type() Type { return p.typ }
   200  
   201  func (p *rowBufferPage) Column() int { return p.column }
   202  
   203  func (p *rowBufferPage) Dictionary() Dictionary { return nil }
   204  
   205  func (p *rowBufferPage) NumRows() int64 { return int64(len(p.rows)) }
   206  
   207  func (p *rowBufferPage) NumValues() int64 {
   208  	numValues := int64(0)
   209  	p.scan(func(value Value) {
   210  		if !value.isNull() {
   211  			numValues++
   212  		}
   213  	})
   214  	return numValues
   215  }
   216  
   217  func (p *rowBufferPage) NumNulls() int64 {
   218  	numNulls := int64(0)
   219  	p.scan(func(value Value) {
   220  		if value.isNull() {
   221  			numNulls++
   222  		}
   223  	})
   224  	return numNulls
   225  }
   226  
   227  func (p *rowBufferPage) Bounds() (min, max Value, ok bool) {
   228  	p.scan(func(value Value) {
   229  		if !value.IsNull() {
   230  			switch {
   231  			case !ok:
   232  				min, max, ok = value, value, true
   233  			case p.typ.Compare(value, min) < 0:
   234  				min = value
   235  			case p.typ.Compare(value, max) > 0:
   236  				max = value
   237  			}
   238  		}
   239  	})
   240  	return min, max, ok
   241  }
   242  
   243  func (p *rowBufferPage) Size() int64 { return 0 }
   244  
   245  func (p *rowBufferPage) Values() ValueReader {
   246  	return &rowBufferPageValueReader{
   247  		page:        p,
   248  		columnIndex: ^int16(p.column),
   249  	}
   250  }
   251  
   252  func (p *rowBufferPage) Clone() Page {
   253  	rows := make([]Row, len(p.rows))
   254  	for i := range rows {
   255  		rows[i] = p.rows[i].Clone()
   256  	}
   257  	return &rowBufferPage{
   258  		rows:   rows,
   259  		typ:    p.typ,
   260  		column: p.column,
   261  	}
   262  }
   263  
   264  func (p *rowBufferPage) Slice(i, j int64) Page {
   265  	return &rowBufferPage{
   266  		rows:   p.rows[i:j],
   267  		typ:    p.typ,
   268  		column: p.column,
   269  	}
   270  }
   271  
   272  func (p *rowBufferPage) RepetitionLevels() (repetitionLevels []byte) {
   273  	if p.maxRepetitionLevel != 0 {
   274  		repetitionLevels = make([]byte, 0, len(p.rows))
   275  		p.scan(func(value Value) {
   276  			repetitionLevels = append(repetitionLevels, value.repetitionLevel)
   277  		})
   278  	}
   279  	return repetitionLevels
   280  }
   281  
   282  func (p *rowBufferPage) DefinitionLevels() (definitionLevels []byte) {
   283  	if p.maxDefinitionLevel != 0 {
   284  		definitionLevels = make([]byte, 0, len(p.rows))
   285  		p.scan(func(value Value) {
   286  			definitionLevels = append(definitionLevels, value.definitionLevel)
   287  		})
   288  	}
   289  	return definitionLevels
   290  }
   291  
   292  func (p *rowBufferPage) Data() encoding.Values {
   293  	switch p.typ.Kind() {
   294  	case Boolean:
   295  		values := make([]byte, (len(p.rows)+7)/8)
   296  		numValues := 0
   297  		p.scanNonNull(func(value Value) {
   298  			if value.boolean() {
   299  				i := uint(numValues) / 8
   300  				j := uint(numValues) % 8
   301  				values[i] |= 1 << j
   302  			}
   303  			numValues++
   304  		})
   305  		return encoding.BooleanValues(values[:(numValues+7)/8])
   306  
   307  	case Int32:
   308  		values := make([]int32, 0, len(p.rows))
   309  		p.scanNonNull(func(value Value) { values = append(values, value.int32()) })
   310  		return encoding.Int32Values(values)
   311  
   312  	case Int64:
   313  		values := make([]int64, 0, len(p.rows))
   314  		p.scanNonNull(func(value Value) { values = append(values, value.int64()) })
   315  		return encoding.Int64Values(values)
   316  
   317  	case Int96:
   318  		values := make([]deprecated.Int96, 0, len(p.rows))
   319  		p.scanNonNull(func(value Value) { values = append(values, value.int96()) })
   320  		return encoding.Int96Values(values)
   321  
   322  	case Float:
   323  		values := make([]float32, 0, len(p.rows))
   324  		p.scanNonNull(func(value Value) { values = append(values, value.float()) })
   325  		return encoding.FloatValues(values)
   326  
   327  	case Double:
   328  		values := make([]float64, 0, len(p.rows))
   329  		p.scanNonNull(func(value Value) { values = append(values, value.double()) })
   330  		return encoding.DoubleValues(values)
   331  
   332  	case ByteArray:
   333  		values := make([]byte, 0, p.typ.EstimateSize(len(p.rows)))
   334  		offsets := make([]uint32, 0, len(p.rows))
   335  		p.scanNonNull(func(value Value) {
   336  			offsets = append(offsets, uint32(len(values)))
   337  			values = append(values, value.byteArray()...)
   338  		})
   339  		offsets = append(offsets, uint32(len(values)))
   340  		return encoding.ByteArrayValues(values, offsets)
   341  
   342  	case FixedLenByteArray:
   343  		length := p.typ.Length()
   344  		values := make([]byte, 0, length*len(p.rows))
   345  		p.scanNonNull(func(value Value) { values = append(values, value.byteArray()...) })
   346  		return encoding.FixedLenByteArrayValues(values, length)
   347  
   348  	default:
   349  		return encoding.Values{}
   350  	}
   351  }
   352  
   353  func (p *rowBufferPage) scan(f func(Value)) {
   354  	columnIndex := ^int16(p.column)
   355  
   356  	for _, row := range p.rows {
   357  		for _, value := range row {
   358  			if value.columnIndex == columnIndex {
   359  				f(value)
   360  			}
   361  		}
   362  	}
   363  }
   364  
   365  func (p *rowBufferPage) scanNonNull(f func(Value)) {
   366  	p.scan(func(value Value) {
   367  		if !value.isNull() {
   368  			f(value)
   369  		}
   370  	})
   371  }
   372  
   373  type rowBufferPageValueReader struct {
   374  	page        *rowBufferPage
   375  	rowIndex    int
   376  	valueIndex  int
   377  	columnIndex int16
   378  }
   379  
   380  func (r *rowBufferPageValueReader) ReadValues(values []Value) (n int, err error) {
   381  	for n < len(values) && r.rowIndex < len(r.page.rows) {
   382  		for n < len(values) && r.valueIndex < len(r.page.rows[r.rowIndex]) {
   383  			if v := r.page.rows[r.rowIndex][r.valueIndex]; v.columnIndex == r.columnIndex {
   384  				values[n] = v
   385  				n++
   386  			}
   387  			r.valueIndex++
   388  		}
   389  		r.rowIndex++
   390  		r.valueIndex = 0
   391  	}
   392  	if r.rowIndex == len(r.page.rows) {
   393  		err = io.EOF
   394  	}
   395  	return n, err
   396  }
   397  
   398  type rowBufferRows struct {
   399  	rows   []Row
   400  	index  int
   401  	schema *Schema
   402  }
   403  
   404  func (r *rowBufferRows) Close() error {
   405  	r.index = -1
   406  	return nil
   407  }
   408  
   409  func (r *rowBufferRows) Schema() *Schema {
   410  	return r.schema
   411  }
   412  
   413  func (r *rowBufferRows) SeekToRow(rowIndex int64) error {
   414  	if rowIndex < 0 {
   415  		return ErrSeekOutOfRange
   416  	}
   417  
   418  	if r.index < 0 {
   419  		return io.ErrClosedPipe
   420  	}
   421  
   422  	maxRowIndex := int64(len(r.rows))
   423  	if rowIndex > maxRowIndex {
   424  		rowIndex = maxRowIndex
   425  	}
   426  
   427  	r.index = int(rowIndex)
   428  	return nil
   429  }
   430  
   431  func (r *rowBufferRows) ReadRows(rows []Row) (n int, err error) {
   432  	if r.index < 0 {
   433  		return 0, io.EOF
   434  	}
   435  
   436  	if n = len(r.rows) - r.index; n > len(rows) {
   437  		n = len(rows)
   438  	}
   439  
   440  	for i, row := range r.rows[r.index : r.index+n] {
   441  		rows[i] = append(rows[i][:0], row...)
   442  	}
   443  
   444  	if r.index += n; r.index == len(r.rows) {
   445  		err = io.EOF
   446  	}
   447  
   448  	return n, err
   449  }
   450  
   451  func (r *rowBufferRows) WriteRowsTo(w RowWriter) (int64, error) {
   452  	n, err := w.WriteRows(r.rows[r.index:])
   453  	r.index += n
   454  	return int64(n), err
   455  }
   456  
   457  var (
   458  	_ RowGroup       = (*RowBuffer[any])(nil)
   459  	_ RowWriter      = (*RowBuffer[any])(nil)
   460  	_ sort.Interface = (*RowBuffer[any])(nil)
   461  
   462  	_ RowWriterTo = (*rowBufferRows)(nil)
   463  )