github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/convert.go

package parquet

import (
	"fmt"
	"io"
	"sync"
)

// ConvertError is an error type returned by calls to Convert when the
// conversion of parquet schemas is impossible or the input row for the
// conversion is malformed.
type ConvertError struct {
	Path []string
	From Node
	To   Node
}

// Error satisfies the error interface.
func (e *ConvertError) Error() string {
	sourceType := e.From.Type()
	targetType := e.To.Type()

	sourceRepetition := fieldRepetitionTypeOf(e.From)
	targetRepetition := fieldRepetitionTypeOf(e.To)

	return fmt.Sprintf("cannot convert parquet column %q from %s %s to %s %s",
		columnPath(e.Path),
		sourceRepetition,
		sourceType,
		targetRepetition,
		targetType,
	)
}

// Conversion is an interface implemented by types that provide conversion of
// parquet rows from one schema to another.
//
// Conversion instances must be safe to use concurrently from multiple
// goroutines.
type Conversion interface {
	// Applies the conversion logic on the src row, returning the result
	// appended to dst.
	Convert(dst, src Row) (Row, error)
	// Maps the given column index in the target schema to the corresponding
	// column index in the source schema of the conversion.
	Column(int) int
	// Returns the target schema of the conversion.
	Schema() *Schema
}

// conversion is the default Conversion implementation; it remaps column
// indexes between the source and target schemas and synthesizes null values
// for target columns that are missing from the source.
type conversion struct {
	targetColumnKinds   []Kind
	targetToSourceIndex []int16
	sourceToTargetIndex []int16
	schema              *Schema
	buffers             sync.Pool
}

// conversionBuffer holds one value slice per target column; buffers are
// pooled to amortize allocations across calls to Convert.
type conversionBuffer struct {
	columns [][]Value
}

func (c *conversion) getBuffer() *conversionBuffer {
	b, _ := c.buffers.Get().(*conversionBuffer)
	if b == nil {
		n := len(c.targetColumnKinds)
		columns, values := make([][]Value, n), make([]Value, n)
		for i := range columns {
			// Each column starts as a zero-length slice with capacity for
			// one value, carved out of the shared backing array.
			columns[i] = values[i : i : i+1]
		}
		b = &conversionBuffer{columns: columns}
	}
	return b
}

func (c *conversion) putBuffer(b *conversionBuffer) {
	for i, values := range b.columns {
		clearValues(values)
		b.columns[i] = values[:0]
	}
	c.buffers.Put(b)
}

func (c *conversion) Convert(target, source Row) (Row, error) {
	buffer := c.getBuffer()
	defer c.putBuffer(buffer)

	// Group the source values by target column, dropping values of columns
	// that do not exist in the target schema.
	for _, value := range source {
		sourceIndex := value.Column()
		targetIndex := c.sourceToTargetIndex[sourceIndex]
		if targetIndex >= 0 {
			value.kind = ^int8(c.targetColumnKinds[targetIndex])
			value.columnIndex = ^targetIndex
			buffer.columns[targetIndex] = append(buffer.columns[targetIndex], value)
		}
	}

	// Target columns that received no value are filled with a single null.
	for i, values := range buffer.columns {
		if len(values) == 0 {
			values = append(values, Value{
				kind:        ^int8(c.targetColumnKinds[i]),
				columnIndex: ^int16(i),
			})
		}
		target = append(target, values...)
	}

	return target, nil
}

func (c *conversion) Column(i int) int {
	return int(c.targetToSourceIndex[i])
}

func (c *conversion) Schema() *Schema {
	return c.schema
}

// identity is the no-op Conversion returned when the source and target
// schemas are equal.
type identity struct{ schema *Schema }

func (id identity) Convert(dst, src Row) (Row, error) { return append(dst, src...), nil }
func (id identity) Column(i int) int                  { return i }
func (id identity) Schema() *Schema                   { return id.schema }
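// Worked example (added commentary, not part of the upstream file): converting
// a source schema with leaf columns (a, b, c) to a target schema with leaf
// columns (a, c, d) yields the index tables below, assuming that lookups of
// missing paths produce a negative column index, as the checks in this file
// expect:
//
//	targetToSourceIndex = [0, 2, -1] // a<-a, c<-c, d has no source column
//	sourceToTargetIndex = [0, -1, 1] // a->a, b is dropped, c->c
//
// During conversion, values of column b are skipped, values of a and c are
// re-tagged with their target column indexes, and a single null value is
// synthesized for column d.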
// Convert constructs a conversion function from one parquet schema to another.
//
// The function supports converting between schemas where the source or target
// have extra columns; if there are more columns in the source, they will be
// stripped out of the rows. Extra columns in the target schema will be set to
// null or zero values.
//
// The returned function is intended to be used to append the converted source
// row to the destination buffer.
func Convert(to, from Node) (conv Conversion, err error) {
	schema, _ := to.(*Schema)
	if schema == nil {
		schema = NewSchema("", to)
	}

	if nodesAreEqual(to, from) {
		return identity{schema}, nil
	}

	targetMapping, targetColumns := columnMappingOf(to)
	sourceMapping, sourceColumns := columnMappingOf(from)

	columnIndexBuffer := make([]int16, len(targetColumns)+len(sourceColumns))
	targetColumnKinds := make([]Kind, len(targetColumns))
	targetToSourceIndex := columnIndexBuffer[:len(targetColumns)]
	sourceToTargetIndex := columnIndexBuffer[len(targetColumns):]

	for i, path := range targetColumns {
		sourceColumn := sourceMapping.lookup(path)
		targetColumn := targetMapping.lookup(path)
		targetToSourceIndex[i] = sourceColumn.columnIndex
		targetColumnKinds[i] = targetColumn.node.Type().Kind()
	}

	for i, path := range sourceColumns {
		sourceColumn := sourceMapping.lookup(path)
		targetColumn := targetMapping.lookup(path)

		if targetColumn.node != nil {
			sourceType := sourceColumn.node.Type()
			targetType := targetColumn.node.Type()
			if sourceType.Kind() != targetType.Kind() {
				return nil, &ConvertError{Path: path, From: sourceColumn.node, To: targetColumn.node}
			}

			sourceRepetition := fieldRepetitionTypeOf(sourceColumn.node)
			targetRepetition := fieldRepetitionTypeOf(targetColumn.node)
			if sourceRepetition != targetRepetition {
				return nil, &ConvertError{Path: path, From: sourceColumn.node, To: targetColumn.node}
			}
		}

		sourceToTargetIndex[i] = targetColumn.columnIndex
	}

	return &conversion{
		targetColumnKinds:   targetColumnKinds,
		targetToSourceIndex: targetToSourceIndex,
		sourceToTargetIndex: sourceToTargetIndex,
		schema:              schema,
	}, nil
}
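// Usage sketch (illustrative, not part of the upstream file; it assumes this
// package's SchemaOf and Schema.Deconstruct helpers and elides error
// handling):
//
//	type v1 struct{ ID int64 }
//	type v2 struct {
//		ID   int64
//		Name string
//	}
//
//	conv, _ := Convert(SchemaOf(v2{}), SchemaOf(v1{}))
//	src := SchemaOf(v1{}).Deconstruct(nil, &v1{ID: 42})
//	dst, _ := conv.Convert(nil, src) // Name is filled with a null value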
// ConvertRowGroup constructs a wrapper of the given row group which applies
// the given schema conversion to its rows.
func ConvertRowGroup(rowGroup RowGroup, conv Conversion) RowGroup {
	schema := conv.Schema()
	numRows := rowGroup.NumRows()
	rowGroupColumns := rowGroup.ColumnChunks()

	columns := make([]ColumnChunk, numLeafColumnsOf(schema))
	forEachLeafColumnOf(schema, func(leaf leafColumn) {
		i := leaf.columnIndex
		j := conv.Column(int(leaf.columnIndex))
		if j < 0 {
			columns[i] = &missingColumnChunk{
				typ:    leaf.node.Type(),
				column: i,
				// TODO: we assume the number of values is the same as the
				// number of rows, which may not be accurate when the column is
				// part of a repeated group; neighbor columns may be repeated,
				// in which case this chunk would have to be repeated as well.
				numRows:   numRows,
				numValues: numRows,
				numNulls:  numRows,
			}
		} else {
			columns[i] = rowGroupColumns[j]
		}
	})

	// Sorting columns must exist on the conversion schema in order to be
	// advertised on the converted row group; otherwise the resulting rows
	// would not be in the right order.
	sorting := []SortingColumn{}
	for _, col := range rowGroup.SortingColumns() {
		if !hasColumnPath(schema, col.Path()) {
			// Stop at the first missing column to preserve the precedence of
			// the remaining sorting columns.
			break
		}
		sorting = append(sorting, col)
	}

	return &convertedRowGroup{
		// The pair of rowGroup+conv is retained to construct a converted row
		// reader by wrapping the underlying row reader of the row group because
		// it allows proper reconstruction of the repetition and definition
		// levels.
		//
		// TODO: can we figure out how to set the repetition and definition
		// levels when reading values from missing column pages? At first sight
		// it appears complex to do, however:
		//
		// * It is possible that having these levels when reading values of
		//   missing column pages is not necessary in some scenarios (e.g. when
		//   merging row groups).
		//
		// * We may be able to assume the repetition and definition levels at
		//   the call site (e.g. in the functions reading rows from columns).
		//
		// Columns of the source row group which do not exist in the target are
		// masked to prevent loading unneeded pages when reading rows from the
		// converted row group.
		rowGroup: maskMissingRowGroupColumns(rowGroup, len(columns), conv),
		columns:  columns,
		sorting:  sorting,
		conv:     conv,
	}
}

func maskMissingRowGroupColumns(r RowGroup, numColumns int, conv Conversion) RowGroup {
	rowGroupColumns := r.ColumnChunks()
	columns := make([]ColumnChunk, len(rowGroupColumns))
	missing := make([]missingColumnChunk, len(columns))
	numRows := r.NumRows()

	for i := range missing {
		missing[i] = missingColumnChunk{
			typ:       rowGroupColumns[i].Type(),
			column:    int16(i),
			numRows:   numRows,
			numValues: numRows,
			numNulls:  numRows,
		}
	}

	for i := range columns {
		columns[i] = &missing[i]
	}

	// Unmask the source columns that the conversion actually reads.
	for i := 0; i < numColumns; i++ {
		j := conv.Column(i)
		if j >= 0 && j < len(columns) {
			columns[j] = rowGroupColumns[j]
		}
	}

	return &rowGroup{
		schema:  r.Schema(),
		numRows: numRows,
		columns: columns,
	}
}
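// Usage sketch (illustrative, not part of the upstream file; it assumes this
// package's OpenFile API and elides error handling): converting the row groups
// of an existing file to a new schema before processing their rows.
//
//	f, _ := OpenFile(r, size)
//	for _, rg := range f.RowGroups() {
//		conv, _ := Convert(newSchema, rg.Schema())
//		rows := ConvertRowGroup(rg, conv).Rows()
//		// read from rows, then rows.Close()
//	}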
// missingColumnChunk is a placeholder column chunk for a column that has no
// counterpart in the underlying row group; it reads back as null values.
type missingColumnChunk struct {
	typ       Type
	column    int16
	numRows   int64
	numValues int64
	numNulls  int64
}

func (c *missingColumnChunk) Type() Type               { return c.typ }
func (c *missingColumnChunk) Column() int              { return int(c.column) }
func (c *missingColumnChunk) Pages() Pages             { return onePage(missingPage{c}) }
func (c *missingColumnChunk) ColumnIndex() ColumnIndex { return missingColumnIndex{c} }
func (c *missingColumnChunk) OffsetIndex() OffsetIndex { return missingOffsetIndex{} }
func (c *missingColumnChunk) BloomFilter() BloomFilter { return missingBloomFilter{} }
func (c *missingColumnChunk) NumValues() int64         { return 0 }

type missingColumnIndex struct{ *missingColumnChunk }

func (i missingColumnIndex) NumPages() int       { return 1 }
func (i missingColumnIndex) NullCount(int) int64 { return i.numNulls }
func (i missingColumnIndex) NullPage(int) bool   { return true }
func (i missingColumnIndex) MinValue(int) Value  { return Value{} }
func (i missingColumnIndex) MaxValue(int) Value  { return Value{} }
func (i missingColumnIndex) IsAscending() bool   { return true }
func (i missingColumnIndex) IsDescending() bool  { return false }

type missingOffsetIndex struct{}

func (missingOffsetIndex) NumPages() int                { return 1 }
func (missingOffsetIndex) Offset(int) int64             { return 0 }
func (missingOffsetIndex) CompressedPageSize(int) int64 { return 0 }
func (missingOffsetIndex) FirstRowIndex(int) int64      { return 0 }

type missingBloomFilter struct{}

func (missingBloomFilter) ReadAt([]byte, int64) (int, error) { return 0, io.EOF }
func (missingBloomFilter) Size() int64                       { return 0 }
func (missingBloomFilter) Check(Value) (bool, error)         { return false, nil }

type missingPage struct{ *missingColumnChunk }

func (p missingPage) Column() int                       { return int(p.column) }
func (p missingPage) Dictionary() Dictionary            { return nil }
func (p missingPage) NumRows() int64                    { return p.numRows }
func (p missingPage) NumValues() int64                  { return p.numValues }
func (p missingPage) NumNulls() int64                   { return p.numNulls }
func (p missingPage) Bounds() (min, max Value, ok bool) { return }
func (p missingPage) Size() int64                       { return 0 }
func (p missingPage) Values() ValueReader               { return &missingPageValues{page: p} }
func (p missingPage) Buffer() BufferedPage {
	return newErrorPage(p.Type(), p.Column(), "cannot buffer missing page")
}

// missingPageValues produces null values for a missing column until the
// expected number of values has been read.
type missingPageValues struct {
	page missingPage
	read int64
}

func (r *missingPageValues) ReadValues(values []Value) (int, error) {
	remain := r.page.numValues - r.read
	if int64(len(values)) > remain {
		values = values[:remain]
	}
	for i := range values {
		// TODO: how do we set the repetition and definition levels here?
		values[i] = Value{columnIndex: ^r.page.column}
	}
	if r.read += int64(len(values)); r.read == r.page.numValues {
		return len(values), io.EOF
	}
	return len(values), nil
}

func (r *missingPageValues) Close() error {
	r.read = r.page.numValues
	return nil
}

type convertedRowGroup struct {
	rowGroup RowGroup
	columns  []ColumnChunk
	sorting  []SortingColumn
	conv     Conversion
}

func (c *convertedRowGroup) NumRows() int64                  { return c.rowGroup.NumRows() }
func (c *convertedRowGroup) ColumnChunks() []ColumnChunk     { return c.columns }
func (c *convertedRowGroup) Schema() *Schema                 { return c.conv.Schema() }
func (c *convertedRowGroup) SortingColumns() []SortingColumn { return c.sorting }
func (c *convertedRowGroup) Rows() Rows {
	rows := c.rowGroup.Rows()
	return &convertedRows{
		Closer: rows,
		rows:   rows,
		conv:   c.conv,
	}
}

// ConvertRowReader constructs a wrapper of the given row reader which applies
// the given schema conversion to the rows.
func ConvertRowReader(rows RowReader, conv Conversion) RowReaderWithSchema {
	return &convertedRows{rows: &forwardRowSeeker{rows: rows}, conv: conv}
}

type convertedRows struct {
	io.Closer
	rows RowReadSeeker
	buf  Row
	conv Conversion
}

func (c *convertedRows) ReadRows(rows []Row) (int, error) {
	maxRowLen := 0
	defer func() {
		clearValues(c.buf[:maxRowLen])
	}()

	n, err := c.rows.ReadRows(rows)

	for i, row := range rows[:n] {
		var err error
		c.buf, err = c.conv.Convert(c.buf[:0], row)
		if len(c.buf) > maxRowLen {
			maxRowLen = len(c.buf)
		}
		if err != nil {
			return i, err
		}
		rows[i] = append(row[:0], c.buf...)
	}

	return n, err
}

func (c *convertedRows) Schema() *Schema {
	return c.conv.Schema()
}

func (c *convertedRows) SeekToRow(rowIndex int64) error {
	return c.rows.SeekToRow(rowIndex)
}
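// Usage sketch (illustrative, not part of the upstream file; newSchema,
// oldSchema and src are hypothetical): adapting a RowReader so that consumers
// see rows in the target schema.
//
//	conv, _ := Convert(newSchema, oldSchema)
//	reader := ConvertRowReader(src, conv)
//	buf := make([]Row, 64)
//	for {
//		n, err := reader.ReadRows(buf)
//		// use buf[:n] ...
//		if err != nil {
//			break // err is io.EOF once all rows have been read
//		}
//	}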