github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/convert.go (about)

     1  package parquet
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"sync"
     7  )
     8  
// ConvertError is an error type returned by calls to Convert when the conversion
// of parquet schemas is impossible or the input row for the conversion is
// malformed.
type ConvertError struct {
	Path []string // path of the column that could not be converted
	From Node     // column node in the source schema
	To   Node     // column node in the target schema
}
    17  
    18  // Error satisfies the error interface.
    19  func (e *ConvertError) Error() string {
    20  	sourceType := e.From.Type()
    21  	targetType := e.To.Type()
    22  
    23  	sourceRepetition := fieldRepetitionTypeOf(e.From)
    24  	targetRepetition := fieldRepetitionTypeOf(e.To)
    25  
    26  	return fmt.Sprintf("cannot convert parquet column %q from %s %s to %s %s",
    27  		columnPath(e.Path),
    28  		sourceRepetition,
    29  		sourceType,
    30  		targetRepetition,
    31  		targetType,
    32  	)
    33  }
    34  
// Conversion is an interface implemented by types that provide conversion of
// parquet rows from one schema to another.
//
// Conversion instances must be safe to use concurrently from multiple goroutines.
type Conversion interface {
	// Convert applies the conversion logic on the src row, returning the
	// result appended to dst.
	Convert(dst, src Row) (Row, error)
	// Column converts the given column index in the target schema to the
	// original column index in the source schema of the conversion; a
	// negative result indicates that the target column has no source
	// counterpart.
	Column(int) int
	// Schema returns the target schema of the conversion.
	Schema() *Schema
}
    49  
// conversion is the Conversion implementation returned by Convert when the
// source and target schemas differ.
type conversion struct {
	targetColumnKinds   []Kind    // kind of each target column, indexed by target column index
	targetToSourceIndex []int16   // target column index -> source column index (negative if missing)
	sourceToTargetIndex []int16   // source column index -> target column index (negative if dropped)
	schema              *Schema   // target schema advertised by the conversion
	buffers             sync.Pool // pool of *conversionBuffer reused across Convert calls
}
    57  
// conversionBuffer holds per-target-column staging slices used by
// conversion.Convert to group source values by their target column.
type conversionBuffer struct {
	columns [][]Value // one slice of values per target column
}
    61  
    62  func (c *conversion) getBuffer() *conversionBuffer {
    63  	b, _ := c.buffers.Get().(*conversionBuffer)
    64  	if b == nil {
    65  		n := len(c.targetColumnKinds)
    66  		columns, values := make([][]Value, n), make([]Value, n)
    67  		for i := range columns {
    68  			columns[i] = values[i : i : i+1]
    69  		}
    70  		b = &conversionBuffer{columns: columns}
    71  	}
    72  	return b
    73  }
    74  
    75  func (c *conversion) putBuffer(b *conversionBuffer) {
    76  	for i, values := range b.columns {
    77  		clearValues(values)
    78  		b.columns[i] = values[:0]
    79  	}
    80  	c.buffers.Put(b)
    81  }
    82  
// Convert applies the schema conversion to the source row, appending the
// converted values to target. Source values whose column exists in the
// target schema are re-tagged with the target column's kind and index;
// target columns which received no source value get a single null value, so
// the output row always carries at least one value per target column.
func (c *conversion) Convert(target, source Row) (Row, error) {
	buffer := c.getBuffer()
	defer c.putBuffer(buffer)

	// Dispatch each source value to the staging slice of its target column;
	// values of source columns with no target counterpart (negative index)
	// are dropped.
	for _, value := range source {
		sourceIndex := value.Column()
		targetIndex := c.sourceToTargetIndex[sourceIndex]
		if targetIndex >= 0 {
			// kind and columnIndex are stored in one's-complement form on
			// Value, hence the ^ when re-tagging for the target column.
			value.kind = ^int8(c.targetColumnKinds[targetIndex])
			value.columnIndex = ^targetIndex
			buffer.columns[targetIndex] = append(buffer.columns[targetIndex], value)
		}
	}

	// Flush the staged values in target column order, synthesizing a null
	// value for any target column that had no source values.
	for i, values := range buffer.columns {
		if len(values) == 0 {
			values = append(values, Value{
				kind:        ^int8(c.targetColumnKinds[i]),
				columnIndex: ^int16(i),
			})
		}
		target = append(target, values...)
	}

	return target, nil
}
   109  
// Column returns the source column index mapped to the target column index i,
// or a negative value if the target column does not exist in the source
// schema.
func (c *conversion) Column(i int) int {
	return int(c.targetToSourceIndex[i])
}
   113  
// Schema returns the target schema of the conversion.
func (c *conversion) Schema() *Schema {
	return c.schema
}
   117  
// identity is the no-op Conversion returned by Convert when the source and
// target schemas are equal: rows pass through unchanged and column indexes
// map to themselves.
type identity struct{ schema *Schema }

func (id identity) Convert(dst, src Row) (Row, error) { return append(dst, src...), nil }
func (id identity) Column(i int) int                  { return i }
func (id identity) Schema() *Schema                   { return id.schema }
   123  
   124  // Convert constructs a conversion function from one parquet schema to another.
   125  //
   126  // The function supports converting between schemas where the source or target
   127  // have extra columns; if there are more columns in the source, they will be
   128  // stripped out of the rows. Extra columns in the target schema will be set to
   129  // null or zero values.
   130  //
   131  // The returned function is intended to be used to append the converted source
   132  // row to the destination buffer.
   133  func Convert(to, from Node) (conv Conversion, err error) {
   134  	schema, _ := to.(*Schema)
   135  	if schema == nil {
   136  		schema = NewSchema("", to)
   137  	}
   138  
   139  	if nodesAreEqual(to, from) {
   140  		return identity{schema}, nil
   141  	}
   142  
   143  	targetMapping, targetColumns := columnMappingOf(to)
   144  	sourceMapping, sourceColumns := columnMappingOf(from)
   145  
   146  	columnIndexBuffer := make([]int16, len(targetColumns)+len(sourceColumns))
   147  	targetColumnKinds := make([]Kind, len(targetColumns))
   148  	targetToSourceIndex := columnIndexBuffer[:len(targetColumns)]
   149  	sourceToTargetIndex := columnIndexBuffer[len(targetColumns):]
   150  
   151  	for i, path := range targetColumns {
   152  		sourceColumn := sourceMapping.lookup(path)
   153  		targetColumn := targetMapping.lookup(path)
   154  		targetToSourceIndex[i] = sourceColumn.columnIndex
   155  		targetColumnKinds[i] = targetColumn.node.Type().Kind()
   156  	}
   157  
   158  	for i, path := range sourceColumns {
   159  		sourceColumn := sourceMapping.lookup(path)
   160  		targetColumn := targetMapping.lookup(path)
   161  
   162  		if targetColumn.node != nil {
   163  			sourceType := sourceColumn.node.Type()
   164  			targetType := targetColumn.node.Type()
   165  			if sourceType.Kind() != targetType.Kind() {
   166  				return nil, &ConvertError{Path: path, From: sourceColumn.node, To: targetColumn.node}
   167  			}
   168  
   169  			sourceRepetition := fieldRepetitionTypeOf(sourceColumn.node)
   170  			targetRepetition := fieldRepetitionTypeOf(targetColumn.node)
   171  			if sourceRepetition != targetRepetition {
   172  				return nil, &ConvertError{Path: path, From: sourceColumn.node, To: targetColumn.node}
   173  			}
   174  		}
   175  
   176  		sourceToTargetIndex[i] = targetColumn.columnIndex
   177  	}
   178  
   179  	return &conversion{
   180  		targetColumnKinds:   targetColumnKinds,
   181  		targetToSourceIndex: targetToSourceIndex,
   182  		sourceToTargetIndex: sourceToTargetIndex,
   183  		schema:              schema,
   184  	}, nil
   185  }
   186  
// ConvertRowGroup constructs a wrapper of the given row group which applies
// the given schema conversion to its rows.
func ConvertRowGroup(rowGroup RowGroup, conv Conversion) RowGroup {
	schema := conv.Schema()
	numRows := rowGroup.NumRows()
	rowGroupColumns := rowGroup.ColumnChunks()

	// Build the converted column chunk list in target schema order: columns
	// with a source counterpart reuse the source chunk, the rest are filled
	// with placeholder chunks that produce only nulls.
	columns := make([]ColumnChunk, numLeafColumnsOf(schema))
	forEachLeafColumnOf(schema, func(leaf leafColumn) {
		i := leaf.columnIndex
		j := conv.Column(int(leaf.columnIndex))
		if j < 0 {
			columns[i] = &missingColumnChunk{
				typ:    leaf.node.Type(),
				column: i,
				// TODO: we assume the number of values is the same as the
				// number of rows, which may not be accurate when the column is
				// part of a repeated group; neighbor columns may be repeated in
				// which case it would be impossible for this chunk not to be.
				numRows:   numRows,
				numValues: numRows,
				numNulls:  numRows,
			}
		} else {
			columns[i] = rowGroupColumns[j]
		}
	})

	// Sorting columns must exist on the conversion schema in order to be
	// advertised on the converted row group otherwise the resulting rows
	// would not be in the right order. Only the longest prefix of sorting
	// columns present in the target schema is kept, hence the break.
	sorting := []SortingColumn{}
	for _, col := range rowGroup.SortingColumns() {
		if !hasColumnPath(schema, col.Path()) {
			break
		}
		sorting = append(sorting, col)
	}

	return &convertedRowGroup{
		// The pair of rowGroup+conv is retained to construct a converted row
		// reader by wrapping the underlying row reader of the row group because
		// it allows proper reconstruction of the repetition and definition
		// levels.
		//
		// TODO: can we figure out how to set the repetition and definition
		// levels when reading values from missing column pages? At first sight
		// it appears complex to do, however:
		//
		// * It is possible that having these levels when reading values of
		//   missing column pages is not necessary in some scenarios (e.g. when
		//   merging row groups).
		//
		// * We may be able to assume the repetition and definition levels at
		//   the call site (e.g. in the functions reading rows from columns).
		//
		// Columns of the source row group which do not exist in the target are
		// masked to prevent loading unneeded pages when reading rows from the
		// converted row group.
		rowGroup: maskMissingRowGroupColumns(rowGroup, len(columns), conv),
		columns:  columns,
		sorting:  sorting,
		conv:     conv,
	}
}
   252  
   253  func maskMissingRowGroupColumns(r RowGroup, numColumns int, conv Conversion) RowGroup {
   254  	rowGroupColumns := r.ColumnChunks()
   255  	columns := make([]ColumnChunk, len(rowGroupColumns))
   256  	missing := make([]missingColumnChunk, len(columns))
   257  	numRows := r.NumRows()
   258  
   259  	for i := range missing {
   260  		missing[i] = missingColumnChunk{
   261  			typ:       rowGroupColumns[i].Type(),
   262  			column:    int16(i),
   263  			numRows:   numRows,
   264  			numValues: numRows,
   265  			numNulls:  numRows,
   266  		}
   267  	}
   268  
   269  	for i := range columns {
   270  		columns[i] = &missing[i]
   271  	}
   272  
   273  	for i := 0; i < numColumns; i++ {
   274  		j := conv.Column(i)
   275  		if j >= 0 && j < len(columns) {
   276  			columns[j] = rowGroupColumns[j]
   277  		}
   278  	}
   279  
   280  	return &rowGroup{
   281  		schema:  r.Schema(),
   282  		numRows: numRows,
   283  		columns: columns,
   284  	}
   285  }
   286  
// missingColumnChunk is a placeholder ColumnChunk standing in for a column
// that does not exist in the underlying row group; every value it produces
// is null.
type missingColumnChunk struct {
	typ       Type  // parquet type advertised for the missing column
	column    int16 // column index of the chunk
	numRows   int64
	numValues int64
	numNulls  int64
}
   294  
// missingColumnChunk implements the ColumnChunk interface with a single page
// of nulls; NumValues reports 0 since the chunk holds no non-null values.
func (c *missingColumnChunk) Type() Type               { return c.typ }
func (c *missingColumnChunk) Column() int              { return int(c.column) }
func (c *missingColumnChunk) Pages() Pages             { return onePage(missingPage{c}) }
func (c *missingColumnChunk) ColumnIndex() ColumnIndex { return missingColumnIndex{c} }
func (c *missingColumnChunk) OffsetIndex() OffsetIndex { return missingOffsetIndex{} }
func (c *missingColumnChunk) BloomFilter() BloomFilter { return missingBloomFilter{} }
func (c *missingColumnChunk) NumValues() int64         { return 0 }
   302  
// missingColumnIndex is the ColumnIndex of a missing column chunk: it
// advertises a single null page with zero-value bounds.
type missingColumnIndex struct{ *missingColumnChunk }

func (i missingColumnIndex) NumPages() int       { return 1 }
func (i missingColumnIndex) NullCount(int) int64 { return i.numNulls }
func (i missingColumnIndex) NullPage(int) bool   { return true }
func (i missingColumnIndex) MinValue(int) Value  { return Value{} }
func (i missingColumnIndex) MaxValue(int) Value  { return Value{} }
func (i missingColumnIndex) IsAscending() bool   { return true }
func (i missingColumnIndex) IsDescending() bool  { return false }
   312  
// missingOffsetIndex is the OffsetIndex of a missing column chunk: one page
// with zero offset and size since there is no data to locate.
type missingOffsetIndex struct{}

func (missingOffsetIndex) NumPages() int                { return 1 }
func (missingOffsetIndex) Offset(int) int64             { return 0 }
func (missingOffsetIndex) CompressedPageSize(int) int64 { return 0 }
func (missingOffsetIndex) FirstRowIndex(int) int64      { return 0 }
   319  
// missingBloomFilter is the BloomFilter of a missing column chunk: it is
// empty, so reads hit EOF immediately and Check never matches.
type missingBloomFilter struct{}

func (missingBloomFilter) ReadAt([]byte, int64) (int, error) { return 0, io.EOF }
func (missingBloomFilter) Size() int64                       { return 0 }
func (missingBloomFilter) Check(Value) (bool, error)         { return false, nil }
   325  
// missingPage is the single page of a missing column chunk; it reports the
// chunk's row/value/null counts and its reader yields only null values.
type missingPage struct{ *missingColumnChunk }

func (p missingPage) Column() int                       { return int(p.column) }
func (p missingPage) Dictionary() Dictionary            { return nil }
func (p missingPage) NumRows() int64                    { return p.numRows }
func (p missingPage) NumValues() int64                  { return p.numValues }
func (p missingPage) NumNulls() int64                   { return p.numNulls }
func (p missingPage) Bounds() (min, max Value, ok bool) { return }
func (p missingPage) Size() int64                       { return 0 }
func (p missingPage) Values() ValueReader               { return &missingPageValues{page: p} }
func (p missingPage) Buffer() BufferedPage {
	return newErrorPage(p.Type(), p.Column(), "cannot buffer missing page")
}
   339  
// missingPageValues reads the synthetic null values of a missing page,
// tracking how many values have been produced so far.
type missingPageValues struct {
	page missingPage
	read int64 // number of values already returned by ReadValues
}
   344  
   345  func (r *missingPageValues) ReadValues(values []Value) (int, error) {
   346  	remain := r.page.numValues - r.read
   347  	if int64(len(values)) > remain {
   348  		values = values[:remain]
   349  	}
   350  	for i := range values {
   351  		// TODO: how do we set the repetition and definition levels here?
   352  		values[i] = Value{columnIndex: ^r.page.column}
   353  	}
   354  	if r.read += int64(len(values)); r.read == r.page.numValues {
   355  		return len(values), io.EOF
   356  	}
   357  	return len(values), nil
   358  }
   359  
// Close marks the page as fully consumed so that subsequent calls to
// ReadValues return no more values.
func (r *missingPageValues) Close() error {
	r.read = r.page.numValues
	return nil
}
   364  
// convertedRowGroup is the RowGroup implementation returned by
// ConvertRowGroup, pairing the (masked) source row group with the conversion
// applied to its rows.
type convertedRowGroup struct {
	rowGroup RowGroup        // source row group with unmapped columns masked
	columns  []ColumnChunk   // column chunks in target schema order
	sorting  []SortingColumn // sorting columns preserved by the conversion
	conv     Conversion
}
   371  
func (c *convertedRowGroup) NumRows() int64                  { return c.rowGroup.NumRows() }
func (c *convertedRowGroup) ColumnChunks() []ColumnChunk     { return c.columns }
func (c *convertedRowGroup) Schema() *Schema                 { return c.conv.Schema() }
func (c *convertedRowGroup) SortingColumns() []SortingColumn { return c.sorting }

// Rows returns a reader which applies the schema conversion to each row read
// from the underlying row group.
func (c *convertedRowGroup) Rows() Rows {
	rows := c.rowGroup.Rows()
	return &convertedRows{
		Closer: rows,
		rows:   rows,
		conv:   c.conv,
	}
}
   384  
// ConvertRowReader constructs a wrapper of the given row reader which applies
// the given schema conversion to the rows.
func ConvertRowReader(rows RowReader, conv Conversion) RowReaderWithSchema {
	// forwardRowSeeker adapts the plain reader to the RowReadSeeker interface
	// that convertedRows requires.
	return &convertedRows{rows: &forwardRowSeeker{rows: rows}, conv: conv}
}
   390  
// convertedRows wraps a row reader and applies a schema conversion to every
// row it reads.
type convertedRows struct {
	io.Closer
	rows RowReadSeeker
	buf  Row // scratch row reused across ReadRows calls
	conv Conversion
}
   397  
   398  func (c *convertedRows) ReadRows(rows []Row) (int, error) {
   399  	maxRowLen := 0
   400  	defer func() {
   401  		clearValues(c.buf[:maxRowLen])
   402  	}()
   403  
   404  	n, err := c.rows.ReadRows(rows)
   405  
   406  	for i, row := range rows[:n] {
   407  		var err error
   408  		c.buf, err = c.conv.Convert(c.buf[:0], row)
   409  		if len(c.buf) > maxRowLen {
   410  			maxRowLen = len(c.buf)
   411  		}
   412  		if err != nil {
   413  			return i, err
   414  		}
   415  		rows[i] = append(row[:0], c.buf...)
   416  	}
   417  
   418  	return n, err
   419  }
   420  
// Schema returns the target schema of the conversion applied to the rows.
func (c *convertedRows) Schema() *Schema {
	return c.conv.Schema()
}
   424  
// SeekToRow positions the underlying reader on the row at the given index.
func (c *convertedRows) SeekToRow(rowIndex int64) error {
	return c.rows.SeekToRow(rowIndex)
}