github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/buffer.go (about) 1 package parquet 2 3 import ( 4 "sort" 5 ) 6 7 // Buffer represents an in-memory group of parquet rows. 8 // 9 // The main purpose of the Buffer type is to provide a way to sort rows before 10 // writing them to a parquet file. Buffer implements sort.Interface as a way 11 // to support reordering the rows that have been written to it. 12 type Buffer struct { 13 config *RowGroupConfig 14 schema *Schema 15 rowbuf []Row 16 colbuf [][]Value 17 chunks []ColumnChunk 18 columns []ColumnBuffer 19 sorted []ColumnBuffer 20 } 21 22 // NewBuffer constructs a new buffer, using the given list of buffer options 23 // to configure the buffer returned by the function. 24 // 25 // The function panics if the buffer configuration is invalid. Programs that 26 // cannot guarantee the validity of the options passed to NewBuffer should 27 // construct the buffer configuration independently prior to calling this 28 // function: 29 // 30 // config, err := parquet.NewRowGroupConfig(options...) 31 // if err != nil { 32 // // handle the configuration error 33 // ... 34 // } else { 35 // // this call to create a buffer is guaranteed not to panic 36 // buffer := parquet.NewBuffer(config) 37 // ... 38 // } 39 // 40 func NewBuffer(options ...RowGroupOption) *Buffer { 41 config, err := NewRowGroupConfig(options...) 42 if err != nil { 43 panic(err) 44 } 45 buf := &Buffer{ 46 config: config, 47 } 48 if config.Schema != nil { 49 buf.configure(config.Schema) 50 } 51 return buf 52 } 53 54 func (buf *Buffer) configure(schema *Schema) { 55 if schema == nil { 56 return 57 } 58 sortingColumns := buf.config.SortingColumns 59 buf.sorted = make([]ColumnBuffer, len(sortingColumns)) 60 61 forEachLeafColumnOf(schema, func(leaf leafColumn) { 62 nullOrdering := nullsGoLast 63 columnIndex := int(leaf.columnIndex) 64 columnType := leaf.node.Type() 65 bufferCap := buf.config.ColumnBufferCapacity 66 dictionary := (Dictionary)(nil) 67 encoding := encodingOf(leaf.node) 68 69 if isDictionaryEncoding(encoding) { 70 dictBuffer := make([]byte, 0, columnType.EstimateSize(bufferCap)) 71 dictionary = columnType.NewDictionary(columnIndex, 0, dictBuffer) 72 columnType = dictionary.Type() 73 } 74 75 column := columnType.NewColumnBuffer(columnIndex, bufferCap) 76 switch { 77 case leaf.maxRepetitionLevel > 0: 78 column = newRepeatedColumnBuffer(column, leaf.maxRepetitionLevel, leaf.maxDefinitionLevel, nullOrdering) 79 case leaf.maxDefinitionLevel > 0: 80 column = newOptionalColumnBuffer(column, leaf.maxDefinitionLevel, nullOrdering) 81 } 82 buf.columns = append(buf.columns, column) 83 84 if sortingIndex := searchSortingColumn(sortingColumns, leaf.path); sortingIndex < len(sortingColumns) { 85 if sortingColumns[sortingIndex].Descending() { 86 column = &reversedColumnBuffer{column} 87 } 88 if sortingColumns[sortingIndex].NullsFirst() { 89 nullOrdering = nullsGoFirst 90 } 91 buf.sorted[sortingIndex] = column 92 } 93 }) 94 95 buf.schema = schema 96 buf.rowbuf = make([]Row, 0, 1) 97 buf.colbuf = make([][]Value, len(buf.columns)) 98 buf.chunks = make([]ColumnChunk, len(buf.columns)) 99 100 for i, column := range buf.columns { 101 buf.chunks[i] = column 102 } 103 } 104 105 // Size returns the estimated size of the buffer in memory (in bytes). 106 func (buf *Buffer) Size() int64 { 107 size := int64(0) 108 for _, col := range buf.columns { 109 size += col.Size() 110 } 111 return size 112 } 113 114 // NumRows returns the number of rows written to the buffer. 115 func (buf *Buffer) NumRows() int64 { return int64(buf.Len()) } 116 117 // ColumnChunks returns the buffer columns. 118 func (buf *Buffer) ColumnChunks() []ColumnChunk { return buf.chunks } 119 120 // ColumnBuffer returns the buffer columns. 121 // 122 // This method is similar to ColumnChunks, but returns a list of ColumnBuffer 123 // instead of a ColumnChunk values (the latter being read-only); calling 124 // ColumnBuffers or ColumnChunks with the same index returns the same underlying 125 // objects, but with different types, which removes the need for making a type 126 // assertion if the program needed to write directly to the column buffers. 127 // The presence of the ColumnChunks method is still required to satisfy the 128 // RowGroup interface. 129 func (buf *Buffer) ColumnBuffers() []ColumnBuffer { return buf.columns } 130 131 // Schema returns the schema of the buffer. 132 // 133 // The schema is either configured by passing a Schema in the option list when 134 // constructing the buffer, or lazily discovered when the first row is written. 135 func (buf *Buffer) Schema() *Schema { return buf.schema } 136 137 // SortingColumns returns the list of columns by which the buffer will be 138 // sorted. 139 // 140 // The sorting order is configured by passing a SortingColumns option when 141 // constructing the buffer. 142 func (buf *Buffer) SortingColumns() []SortingColumn { return buf.config.SortingColumns } 143 144 // Len returns the number of rows written to the buffer. 145 func (buf *Buffer) Len() int { 146 if len(buf.columns) == 0 { 147 return 0 148 } else { 149 // All columns have the same number of rows. 150 return buf.columns[0].Len() 151 } 152 } 153 154 // Less returns true if row[i] < row[j] in the buffer. 155 func (buf *Buffer) Less(i, j int) bool { 156 for _, col := range buf.sorted { 157 switch { 158 case col.Less(i, j): 159 return true 160 case col.Less(j, i): 161 return false 162 } 163 } 164 return false 165 } 166 167 // Swap exchanges the rows at indexes i and j. 168 func (buf *Buffer) Swap(i, j int) { 169 for _, col := range buf.columns { 170 col.Swap(i, j) 171 } 172 } 173 174 // Reset clears the content of the buffer, allowing it to be reused. 175 func (buf *Buffer) Reset() { 176 for _, col := range buf.columns { 177 col.Reset() 178 } 179 } 180 181 // Write writes a row held in a Go value to the buffer. 182 func (buf *Buffer) Write(row interface{}) error { 183 if buf.schema == nil { 184 buf.configure(SchemaOf(row)) 185 } 186 187 buf.rowbuf = buf.rowbuf[:1] 188 defer clearRows(buf.rowbuf) 189 190 buf.rowbuf[0] = buf.schema.Deconstruct(buf.rowbuf[0], row) 191 _, err := buf.WriteRows(buf.rowbuf) 192 return err 193 } 194 195 // WriteRows writes parquet rows to the buffer. 196 func (buf *Buffer) WriteRows(rows []Row) (int, error) { 197 defer func() { 198 for i, colbuf := range buf.colbuf { 199 clearValues(colbuf) 200 buf.colbuf[i] = colbuf[:0] 201 } 202 }() 203 204 if buf.schema == nil { 205 return 0, ErrRowGroupSchemaMissing 206 } 207 208 for _, row := range rows { 209 for _, value := range row { 210 columnIndex := value.Column() 211 buf.colbuf[columnIndex] = append(buf.colbuf[columnIndex], value) 212 } 213 } 214 215 for columnIndex, values := range buf.colbuf { 216 if _, err := buf.columns[columnIndex].WriteValues(values); err != nil { 217 // TOOD: an error at this stage will leave the buffer in an invalid 218 // state since the row was partially written. Applications are not 219 // expected to continue using the buffer after getting an error, 220 // maybe we can enforce it? 221 return 0, err 222 } 223 } 224 225 return len(rows), nil 226 } 227 228 // WriteRowGroup satisfies the RowGroupWriter interface. 229 func (buf *Buffer) WriteRowGroup(rowGroup RowGroup) (int64, error) { 230 rowGroupSchema := rowGroup.Schema() 231 switch { 232 case rowGroupSchema == nil: 233 return 0, ErrRowGroupSchemaMissing 234 case buf.schema == nil: 235 buf.configure(rowGroupSchema) 236 case !nodesAreEqual(buf.schema, rowGroupSchema): 237 return 0, ErrRowGroupSchemaMismatch 238 } 239 if !sortingColumnsHavePrefix(rowGroup.SortingColumns(), buf.SortingColumns()) { 240 return 0, ErrRowGroupSortingColumnsMismatch 241 } 242 n := buf.NumRows() 243 r := rowGroup.Rows() 244 defer r.Close() 245 _, err := CopyRows(bufferWriter{buf}, r) 246 return buf.NumRows() - n, err 247 } 248 249 // Rows returns a reader exposing the current content of the buffer. 250 // 251 // The buffer and the returned reader share memory. Mutating the buffer 252 // concurrently to reading rows may result in non-deterministic behavior. 253 func (buf *Buffer) Rows() Rows { return &rowGroupRows{rowGroup: buf} } 254 255 // bufferWriter is an adapter for Buffer which implements both RowWriter and 256 // PageWriter to enable optimizations in CopyRows for types that support writing 257 // rows by copying whole pages instead of calling WriteRow repeatedly. 258 type bufferWriter struct{ buf *Buffer } 259 260 func (w bufferWriter) WriteRows(rows []Row) (int, error) { 261 return w.buf.WriteRows(rows) 262 } 263 264 func (w bufferWriter) WriteValues(values []Value) (int, error) { 265 return w.buf.columns[values[0].Column()].WriteValues(values) 266 } 267 268 func (w bufferWriter) WritePage(page Page) (int64, error) { 269 return CopyValues(w.buf.columns[page.Column()], page.Values()) 270 } 271 272 var ( 273 _ RowGroup = (*Buffer)(nil) 274 _ RowGroupWriter = (*Buffer)(nil) 275 _ sort.Interface = (*Buffer)(nil) 276 277 _ RowWriter = (*bufferWriter)(nil) 278 _ PageWriter = (*bufferWriter)(nil) 279 _ ValueWriter = (*bufferWriter)(nil) 280 )