github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/buffer.go

github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/buffer.go (about)

     1  package parquet
     2  
     3  import (
     4  	"sort"
     5  )
     6  
     7  // Buffer represents an in-memory group of parquet rows.
     8  //
     9  // The main purpose of the Buffer type is to provide a way to sort rows before
    10  // writing them to a parquet file. Buffer implements sort.Interface as a way
    11  // to support reordering the rows that have been written to it.
    12  type Buffer struct {
    13  	config  *RowGroupConfig
    14  	schema  *Schema
    15  	rowbuf  []Row
    16  	colbuf  [][]Value
    17  	chunks  []ColumnChunk
    18  	columns []ColumnBuffer
    19  	sorted  []ColumnBuffer
    20  }
    21  
    22  // NewBuffer constructs a new buffer, using the given list of buffer options
    23  // to configure the buffer returned by the function.
    24  //
    25  // The function panics if the buffer configuration is invalid. Programs that
    26  // cannot guarantee the validity of the options passed to NewBuffer should
    27  // construct the buffer configuration independently prior to calling this
    28  // function:
    29  //
    30  //	config, err := parquet.NewRowGroupConfig(options...)
    31  //	if err != nil {
    32  //		// handle the configuration error
    33  //		...
    34  //	} else {
    35  //		// this call to create a buffer is guaranteed not to panic
    36  //		buffer := parquet.NewBuffer(config)
    37  //		...
    38  //	}
    39  //
    40  func NewBuffer(options ...RowGroupOption) *Buffer {
    41  	config, err := NewRowGroupConfig(options...)
    42  	if err != nil {
    43  		panic(err)
    44  	}
    45  	buf := &Buffer{
    46  		config: config,
    47  	}
    48  	if config.Schema != nil {
    49  		buf.configure(config.Schema)
    50  	}
    51  	return buf
    52  }
    53  
    54  func (buf *Buffer) configure(schema *Schema) {
    55  	if schema == nil {
    56  		return
    57  	}
    58  	sortingColumns := buf.config.SortingColumns
    59  	buf.sorted = make([]ColumnBuffer, len(sortingColumns))
    60  
    61  	forEachLeafColumnOf(schema, func(leaf leafColumn) {
    62  		nullOrdering := nullsGoLast
    63  		columnIndex := int(leaf.columnIndex)
    64  		columnType := leaf.node.Type()
    65  		bufferCap := buf.config.ColumnBufferCapacity
    66  		dictionary := (Dictionary)(nil)
    67  		encoding := encodingOf(leaf.node)
    68  
    69  		if isDictionaryEncoding(encoding) {
    70  			dictBuffer := make([]byte, 0, columnType.EstimateSize(bufferCap))
    71  			dictionary = columnType.NewDictionary(columnIndex, 0, dictBuffer)
    72  			columnType = dictionary.Type()
    73  		}
    74  
    75  		column := columnType.NewColumnBuffer(columnIndex, bufferCap)
    76  		switch {
    77  		case leaf.maxRepetitionLevel > 0:
    78  			column = newRepeatedColumnBuffer(column, leaf.maxRepetitionLevel, leaf.maxDefinitionLevel, nullOrdering)
    79  		case leaf.maxDefinitionLevel > 0:
    80  			column = newOptionalColumnBuffer(column, leaf.maxDefinitionLevel, nullOrdering)
    81  		}
    82  		buf.columns = append(buf.columns, column)
    83  
    84  		if sortingIndex := searchSortingColumn(sortingColumns, leaf.path); sortingIndex < len(sortingColumns) {
    85  			if sortingColumns[sortingIndex].Descending() {
    86  				column = &reversedColumnBuffer{column}
    87  			}
    88  			if sortingColumns[sortingIndex].NullsFirst() {
    89  				nullOrdering = nullsGoFirst
    90  			}
    91  			buf.sorted[sortingIndex] = column
    92  		}
    93  	})
    94  
    95  	buf.schema = schema
    96  	buf.rowbuf = make([]Row, 0, 1)
    97  	buf.colbuf = make([][]Value, len(buf.columns))
    98  	buf.chunks = make([]ColumnChunk, len(buf.columns))
    99  
   100  	for i, column := range buf.columns {
   101  		buf.chunks[i] = column
   102  	}
   103  }
   104  
   105  // Size returns the estimated size of the buffer in memory (in bytes).
   106  func (buf *Buffer) Size() int64 {
   107  	size := int64(0)
   108  	for _, col := range buf.columns {
   109  		size += col.Size()
   110  	}
   111  	return size
   112  }
   113  
   114  // NumRows returns the number of rows written to the buffer.
   115  func (buf *Buffer) NumRows() int64 { return int64(buf.Len()) }
   116  
   117  // ColumnChunks returns the buffer columns.
   118  func (buf *Buffer) ColumnChunks() []ColumnChunk { return buf.chunks }
   119  
   120  // ColumnBuffer returns the buffer columns.
   121  //
   122  // This method is similar to ColumnChunks, but returns a list of ColumnBuffer
   123  // instead of a ColumnChunk values (the latter being read-only); calling
   124  // ColumnBuffers or ColumnChunks with the same index returns the same underlying
   125  // objects, but with different types, which removes the need for making a type
   126  // assertion if the program needed to write directly to the column buffers.
   127  // The presence of the ColumnChunks method is still required to satisfy the
   128  // RowGroup interface.
   129  func (buf *Buffer) ColumnBuffers() []ColumnBuffer { return buf.columns }
   130  
   131  // Schema returns the schema of the buffer.
   132  //
   133  // The schema is either configured by passing a Schema in the option list when
   134  // constructing the buffer, or lazily discovered when the first row is written.
   135  func (buf *Buffer) Schema() *Schema { return buf.schema }
   136  
   137  // SortingColumns returns the list of columns by which the buffer will be
   138  // sorted.
   139  //
   140  // The sorting order is configured by passing a SortingColumns option when
   141  // constructing the buffer.
   142  func (buf *Buffer) SortingColumns() []SortingColumn { return buf.config.SortingColumns }
   143  
   144  // Len returns the number of rows written to the buffer.
   145  func (buf *Buffer) Len() int {
   146  	if len(buf.columns) == 0 {
   147  		return 0
   148  	} else {
   149  		// All columns have the same number of rows.
   150  		return buf.columns[0].Len()
   151  	}
   152  }
   153  
   154  // Less returns true if row[i] < row[j] in the buffer.
   155  func (buf *Buffer) Less(i, j int) bool {
   156  	for _, col := range buf.sorted {
   157  		switch {
   158  		case col.Less(i, j):
   159  			return true
   160  		case col.Less(j, i):
   161  			return false
   162  		}
   163  	}
   164  	return false
   165  }
   166  
   167  // Swap exchanges the rows at indexes i and j.
   168  func (buf *Buffer) Swap(i, j int) {
   169  	for _, col := range buf.columns {
   170  		col.Swap(i, j)
   171  	}
   172  }
   173  
   174  // Reset clears the content of the buffer, allowing it to be reused.
   175  func (buf *Buffer) Reset() {
   176  	for _, col := range buf.columns {
   177  		col.Reset()
   178  	}
   179  }
   180  
   181  // Write writes a row held in a Go value to the buffer.
   182  func (buf *Buffer) Write(row interface{}) error {
   183  	if buf.schema == nil {
   184  		buf.configure(SchemaOf(row))
   185  	}
   186  
   187  	buf.rowbuf = buf.rowbuf[:1]
   188  	defer clearRows(buf.rowbuf)
   189  
   190  	buf.rowbuf[0] = buf.schema.Deconstruct(buf.rowbuf[0], row)
   191  	_, err := buf.WriteRows(buf.rowbuf)
   192  	return err
   193  }
   194  
   195  // WriteRows writes parquet rows to the buffer.
   196  func (buf *Buffer) WriteRows(rows []Row) (int, error) {
   197  	defer func() {
   198  		for i, colbuf := range buf.colbuf {
   199  			clearValues(colbuf)
   200  			buf.colbuf[i] = colbuf[:0]
   201  		}
   202  	}()
   203  
   204  	if buf.schema == nil {
   205  		return 0, ErrRowGroupSchemaMissing
   206  	}
   207  
   208  	for _, row := range rows {
   209  		for _, value := range row {
   210  			columnIndex := value.Column()
   211  			buf.colbuf[columnIndex] = append(buf.colbuf[columnIndex], value)
   212  		}
   213  	}
   214  
   215  	for columnIndex, values := range buf.colbuf {
   216  		if _, err := buf.columns[columnIndex].WriteValues(values); err != nil {
   217  			// TOOD: an error at this stage will leave the buffer in an invalid
   218  			// state since the row was partially written. Applications are not
   219  			// expected to continue using the buffer after getting an error,
   220  			// maybe we can enforce it?
   221  			return 0, err
   222  		}
   223  	}
   224  
   225  	return len(rows), nil
   226  }
   227  
   228  // WriteRowGroup satisfies the RowGroupWriter interface.
   229  func (buf *Buffer) WriteRowGroup(rowGroup RowGroup) (int64, error) {
   230  	rowGroupSchema := rowGroup.Schema()
   231  	switch {
   232  	case rowGroupSchema == nil:
   233  		return 0, ErrRowGroupSchemaMissing
   234  	case buf.schema == nil:
   235  		buf.configure(rowGroupSchema)
   236  	case !nodesAreEqual(buf.schema, rowGroupSchema):
   237  		return 0, ErrRowGroupSchemaMismatch
   238  	}
   239  	if !sortingColumnsHavePrefix(rowGroup.SortingColumns(), buf.SortingColumns()) {
   240  		return 0, ErrRowGroupSortingColumnsMismatch
   241  	}
   242  	n := buf.NumRows()
   243  	r := rowGroup.Rows()
   244  	defer r.Close()
   245  	_, err := CopyRows(bufferWriter{buf}, r)
   246  	return buf.NumRows() - n, err
   247  }
   248  
   249  // Rows returns a reader exposing the current content of the buffer.
   250  //
   251  // The buffer and the returned reader share memory. Mutating the buffer
   252  // concurrently to reading rows may result in non-deterministic behavior.
   253  func (buf *Buffer) Rows() Rows { return &rowGroupRows{rowGroup: buf} }
   254  
   255  // bufferWriter is an adapter for Buffer which implements both RowWriter and
   256  // PageWriter to enable optimizations in CopyRows for types that support writing
   257  // rows by copying whole pages instead of calling WriteRow repeatedly.
   258  type bufferWriter struct{ buf *Buffer }
   259  
   260  func (w bufferWriter) WriteRows(rows []Row) (int, error) {
   261  	return w.buf.WriteRows(rows)
   262  }
   263  
   264  func (w bufferWriter) WriteValues(values []Value) (int, error) {
   265  	return w.buf.columns[values[0].Column()].WriteValues(values)
   266  }
   267  
   268  func (w bufferWriter) WritePage(page Page) (int64, error) {
   269  	return CopyValues(w.buf.columns[page.Column()], page.Values())
   270  }
   271  
   272  var (
   273  	_ RowGroup       = (*Buffer)(nil)
   274  	_ RowGroupWriter = (*Buffer)(nil)
   275  	_ sort.Interface = (*Buffer)(nil)
   276  
   277  	_ RowWriter   = (*bufferWriter)(nil)
   278  	_ PageWriter  = (*bufferWriter)(nil)
   279  	_ ValueWriter = (*bufferWriter)(nil)
   280  )