github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/row_builder.go (about)

     1  package parquet
     2  
     3  // RowBuilder is a type which helps build parquet rows incrementally by adding
     4  // values to columns.
     5  type RowBuilder struct {
     6  	columns [][]Value
     7  	models  []Value
     8  	levels  []columnLevel
     9  	groups  []*columnGroup
    10  }
    11  
    12  type columnLevel struct {
    13  	repetitionDepth byte
    14  	repetitionLevel byte
    15  	definitionLevel byte
    16  }
    17  
    18  type columnGroup struct {
    19  	baseColumn      []Value
    20  	members         []int16
    21  	startIndex      int16
    22  	endIndex        int16
    23  	repetitionLevel byte
    24  	definitionLevel byte
    25  }
    26  
    27  // NewRowBuilder constructs a RowBuilder which builds rows for the parquet
    28  // schema passed as argument.
    29  func NewRowBuilder(schema Node) *RowBuilder {
    30  	if schema.Leaf() {
    31  		panic("schema of row builder must be a group")
    32  	}
    33  	n := numLeafColumnsOf(schema)
    34  	b := &RowBuilder{
    35  		columns: make([][]Value, n),
    36  		models:  make([]Value, n),
    37  		levels:  make([]columnLevel, n),
    38  	}
    39  	buffers := make([]Value, len(b.columns))
    40  	for i := range b.columns {
    41  		b.columns[i] = buffers[i : i : i+1]
    42  	}
    43  	topGroup := &columnGroup{baseColumn: []Value{{}}}
    44  	endIndex := b.configure(schema, 0, columnLevel{}, topGroup)
    45  	topGroup.endIndex = endIndex
    46  	b.groups = append(b.groups, topGroup)
    47  	return b
    48  }
    49  
    50  func (b *RowBuilder) configure(node Node, columnIndex int16, level columnLevel, group *columnGroup) (endIndex int16) {
    51  	switch {
    52  	case node.Optional():
    53  		level.definitionLevel++
    54  		endIndex = b.configure(Required(node), columnIndex, level, group)
    55  
    56  		for i := columnIndex; i < endIndex; i++ {
    57  			b.models[i].kind = 0 // null if not set
    58  			b.models[i].ptr = nil
    59  			b.models[i].u64 = 0
    60  		}
    61  
    62  	case node.Repeated():
    63  		level.definitionLevel++
    64  
    65  		group = &columnGroup{
    66  			startIndex:      columnIndex,
    67  			repetitionLevel: level.repetitionDepth,
    68  			definitionLevel: level.definitionLevel,
    69  		}
    70  
    71  		level.repetitionDepth++
    72  		endIndex = b.configure(Required(node), columnIndex, level, group)
    73  
    74  		for i := columnIndex; i < endIndex; i++ {
    75  			b.models[i].kind = 0 // null if not set
    76  			b.models[i].ptr = nil
    77  			b.models[i].u64 = 0
    78  		}
    79  
    80  		group.endIndex = endIndex
    81  		b.groups = append(b.groups, group)
    82  
    83  	case node.Leaf():
    84  		typ := node.Type()
    85  		kind := typ.Kind()
    86  		model := makeValueKind(kind)
    87  		model.repetitionLevel = level.repetitionLevel
    88  		model.definitionLevel = level.definitionLevel
    89  		// FIXED_LEN_BYTE_ARRAY is the only type which needs to be given a
    90  		// non-nil zero-value if the field is required.
    91  		if kind == FixedLenByteArray {
    92  			zero := make([]byte, typ.Length())
    93  			model.ptr = &zero[0]
    94  			model.u64 = uint64(len(zero))
    95  		}
    96  		group.members = append(group.members, columnIndex)
    97  		b.models[columnIndex] = model
    98  		b.levels[columnIndex] = level
    99  		endIndex = columnIndex + 1
   100  
   101  	default:
   102  		endIndex = columnIndex
   103  
   104  		for _, field := range node.Fields() {
   105  			endIndex = b.configure(field, endIndex, level, group)
   106  		}
   107  	}
   108  	return endIndex
   109  }
   110  
   111  // Add adds columnValue to the column at columnIndex.
   112  func (b *RowBuilder) Add(columnIndex int, columnValue Value) {
   113  	level := &b.levels[columnIndex]
   114  	columnValue.repetitionLevel = level.repetitionLevel
   115  	columnValue.definitionLevel = level.definitionLevel
   116  	columnValue.columnIndex = ^int16(columnIndex)
   117  	level.repetitionLevel = level.repetitionDepth
   118  	b.columns[columnIndex] = append(b.columns[columnIndex], columnValue)
   119  }
   120  
   121  // Next must be called to indicate the start of a new repeated record for the
   122  // column at the given index.
   123  //
   124  // If the column index is part of a repeated group, the builder automatically
   125  // starts a new record for all adjacent columns, the application does not need
   126  // to call this method for each column of the repeated group.
   127  //
   128  // Next must be called after adding a sequence of records.
   129  func (b *RowBuilder) Next(columnIndex int) {
   130  	for _, group := range b.groups {
   131  		if group.startIndex <= int16(columnIndex) && int16(columnIndex) < group.endIndex {
   132  			for i := group.startIndex; i < group.endIndex; i++ {
   133  				if level := &b.levels[i]; level.repetitionLevel != 0 {
   134  					level.repetitionLevel = group.repetitionLevel
   135  				}
   136  			}
   137  			break
   138  		}
   139  	}
   140  }
   141  
   142  // Reset clears the internal state of b, making it possible to reuse while
   143  // retaining the internal buffers.
   144  func (b *RowBuilder) Reset() {
   145  	for i, column := range b.columns {
   146  		clearValues(column)
   147  		b.columns[i] = column[:0]
   148  	}
   149  	for i := range b.levels {
   150  		b.levels[i].repetitionLevel = 0
   151  	}
   152  }
   153  
   154  // Row materializes the current state of b into a parquet row.
   155  func (b *RowBuilder) Row() Row {
   156  	numValues := 0
   157  	for _, column := range b.columns {
   158  		numValues += len(column)
   159  	}
   160  	return b.AppendRow(make(Row, 0, numValues))
   161  }
   162  
   163  // AppendRow appends the current state of b to row and returns it.
   164  func (b *RowBuilder) AppendRow(row Row) Row {
   165  	for _, group := range b.groups {
   166  		maxColumn := group.baseColumn
   167  
   168  		for _, columnIndex := range group.members {
   169  			if column := b.columns[columnIndex]; len(column) > len(maxColumn) {
   170  				maxColumn = column
   171  			}
   172  		}
   173  
   174  		if len(maxColumn) != 0 {
   175  			columns := b.columns[group.startIndex:group.endIndex]
   176  
   177  			for i, column := range columns {
   178  				if len(column) < len(maxColumn) {
   179  					n := len(column)
   180  					column = append(column, maxColumn[n:]...)
   181  
   182  					columnIndex := group.startIndex + int16(i)
   183  					model := b.models[columnIndex]
   184  
   185  					for n < len(column) {
   186  						v := &column[n]
   187  						v.kind = model.kind
   188  						v.ptr = model.ptr
   189  						v.u64 = model.u64
   190  						v.definitionLevel = group.definitionLevel
   191  						v.columnIndex = ^columnIndex
   192  						n++
   193  					}
   194  
   195  					columns[i] = column
   196  				}
   197  			}
   198  		}
   199  	}
   200  
   201  	return appendRow(row, b.columns)
   202  }