github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/row_builder.go (about) 1 package parquet 2 3 // RowBuilder is a type which helps build parquet rows incrementally by adding 4 // values to columns. 5 type RowBuilder struct { 6 columns [][]Value 7 models []Value 8 levels []columnLevel 9 groups []*columnGroup 10 } 11 12 type columnLevel struct { 13 repetitionDepth byte 14 repetitionLevel byte 15 definitionLevel byte 16 } 17 18 type columnGroup struct { 19 baseColumn []Value 20 members []int16 21 startIndex int16 22 endIndex int16 23 repetitionLevel byte 24 definitionLevel byte 25 } 26 27 // NewRowBuilder constructs a RowBuilder which builds rows for the parquet 28 // schema passed as argument. 29 func NewRowBuilder(schema Node) *RowBuilder { 30 if schema.Leaf() { 31 panic("schema of row builder must be a group") 32 } 33 n := numLeafColumnsOf(schema) 34 b := &RowBuilder{ 35 columns: make([][]Value, n), 36 models: make([]Value, n), 37 levels: make([]columnLevel, n), 38 } 39 buffers := make([]Value, len(b.columns)) 40 for i := range b.columns { 41 b.columns[i] = buffers[i : i : i+1] 42 } 43 topGroup := &columnGroup{baseColumn: []Value{{}}} 44 endIndex := b.configure(schema, 0, columnLevel{}, topGroup) 45 topGroup.endIndex = endIndex 46 b.groups = append(b.groups, topGroup) 47 return b 48 } 49 50 func (b *RowBuilder) configure(node Node, columnIndex int16, level columnLevel, group *columnGroup) (endIndex int16) { 51 switch { 52 case node.Optional(): 53 level.definitionLevel++ 54 endIndex = b.configure(Required(node), columnIndex, level, group) 55 56 for i := columnIndex; i < endIndex; i++ { 57 b.models[i].kind = 0 // null if not set 58 b.models[i].ptr = nil 59 b.models[i].u64 = 0 60 } 61 62 case node.Repeated(): 63 level.definitionLevel++ 64 65 group = &columnGroup{ 66 startIndex: columnIndex, 67 repetitionLevel: level.repetitionDepth, 68 definitionLevel: level.definitionLevel, 69 } 70 71 level.repetitionDepth++ 72 endIndex = b.configure(Required(node), columnIndex, level, group) 73 74 for i := columnIndex; i < endIndex; i++ { 75 b.models[i].kind = 0 // null if not set 76 b.models[i].ptr = nil 77 b.models[i].u64 = 0 78 } 79 80 group.endIndex = endIndex 81 b.groups = append(b.groups, group) 82 83 case node.Leaf(): 84 typ := node.Type() 85 kind := typ.Kind() 86 model := makeValueKind(kind) 87 model.repetitionLevel = level.repetitionLevel 88 model.definitionLevel = level.definitionLevel 89 // FIXED_LEN_BYTE_ARRAY is the only type which needs to be given a 90 // non-nil zero-value if the field is required. 91 if kind == FixedLenByteArray { 92 zero := make([]byte, typ.Length()) 93 model.ptr = &zero[0] 94 model.u64 = uint64(len(zero)) 95 } 96 group.members = append(group.members, columnIndex) 97 b.models[columnIndex] = model 98 b.levels[columnIndex] = level 99 endIndex = columnIndex + 1 100 101 default: 102 endIndex = columnIndex 103 104 for _, field := range node.Fields() { 105 endIndex = b.configure(field, endIndex, level, group) 106 } 107 } 108 return endIndex 109 } 110 111 // Add adds columnValue to the column at columnIndex. 112 func (b *RowBuilder) Add(columnIndex int, columnValue Value) { 113 level := &b.levels[columnIndex] 114 columnValue.repetitionLevel = level.repetitionLevel 115 columnValue.definitionLevel = level.definitionLevel 116 columnValue.columnIndex = ^int16(columnIndex) 117 level.repetitionLevel = level.repetitionDepth 118 b.columns[columnIndex] = append(b.columns[columnIndex], columnValue) 119 } 120 121 // Next must be called to indicate the start of a new repeated record for the 122 // column at the given index. 123 // 124 // If the column index is part of a repeated group, the builder automatically 125 // starts a new record for all adjacent columns, the application does not need 126 // to call this method for each column of the repeated group. 127 // 128 // Next must be called after adding a sequence of records. 129 func (b *RowBuilder) Next(columnIndex int) { 130 for _, group := range b.groups { 131 if group.startIndex <= int16(columnIndex) && int16(columnIndex) < group.endIndex { 132 for i := group.startIndex; i < group.endIndex; i++ { 133 if level := &b.levels[i]; level.repetitionLevel != 0 { 134 level.repetitionLevel = group.repetitionLevel 135 } 136 } 137 break 138 } 139 } 140 } 141 142 // Reset clears the internal state of b, making it possible to reuse while 143 // retaining the internal buffers. 144 func (b *RowBuilder) Reset() { 145 for i, column := range b.columns { 146 clearValues(column) 147 b.columns[i] = column[:0] 148 } 149 for i := range b.levels { 150 b.levels[i].repetitionLevel = 0 151 } 152 } 153 154 // Row materializes the current state of b into a parquet row. 155 func (b *RowBuilder) Row() Row { 156 numValues := 0 157 for _, column := range b.columns { 158 numValues += len(column) 159 } 160 return b.AppendRow(make(Row, 0, numValues)) 161 } 162 163 // AppendRow appends the current state of b to row and returns it. 164 func (b *RowBuilder) AppendRow(row Row) Row { 165 for _, group := range b.groups { 166 maxColumn := group.baseColumn 167 168 for _, columnIndex := range group.members { 169 if column := b.columns[columnIndex]; len(column) > len(maxColumn) { 170 maxColumn = column 171 } 172 } 173 174 if len(maxColumn) != 0 { 175 columns := b.columns[group.startIndex:group.endIndex] 176 177 for i, column := range columns { 178 if len(column) < len(maxColumn) { 179 n := len(column) 180 column = append(column, maxColumn[n:]...) 181 182 columnIndex := group.startIndex + int16(i) 183 model := b.models[columnIndex] 184 185 for n < len(column) { 186 v := &column[n] 187 v.kind = model.kind 188 v.ptr = model.ptr 189 v.u64 = model.u64 190 v.definitionLevel = group.definitionLevel 191 v.columnIndex = ^columnIndex 192 n++ 193 } 194 195 columns[i] = column 196 } 197 } 198 } 199 } 200 201 return appendRow(row, b.columns) 202 }