github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/row_buffer_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "encoding/binary" 6 "errors" 7 "fmt" 8 "io" 9 "math/rand" 10 "reflect" 11 "sort" 12 "testing" 13 14 "github.com/parquet-go/parquet-go" 15 "github.com/parquet-go/parquet-go/encoding" 16 ) 17 18 func TestRowBuffer(t *testing.T) { 19 testRowBuffer[booleanColumn](t) 20 testRowBuffer[int32Column](t) 21 testRowBuffer[int64Column](t) 22 testRowBuffer[int96Column](t) 23 testRowBuffer[floatColumn](t) 24 testRowBuffer[doubleColumn](t) 25 testRowBuffer[byteArrayColumn](t) 26 testRowBuffer[fixedLenByteArrayColumn](t) 27 testRowBuffer[stringColumn](t) 28 testRowBuffer[indexedStringColumn](t) 29 testRowBuffer[uuidColumn](t) 30 testRowBuffer[timeColumn](t) 31 testRowBuffer[timeInMillisColumn](t) 32 testRowBuffer[mapColumn](t) 33 testRowBuffer[decimalColumn](t) 34 testRowBuffer[addressBook](t) 35 testRowBuffer[contact](t) 36 testRowBuffer[listColumn2](t) 37 testRowBuffer[listColumn1](t) 38 testRowBuffer[listColumn0](t) 39 testRowBuffer[nestedListColumn1](t) 40 testRowBuffer[nestedListColumn](t) 41 testRowBuffer[*contact](t) 42 testRowBuffer[paddedBooleanColumn](t) 43 testRowBuffer[optionalInt32Column](t) 44 testRowBuffer[repeatedInt32Column](t) 45 46 for _, test := range bufferTests { 47 t.Run(test.scenario, func(t *testing.T) { 48 for _, mod := range [...]struct { 49 scenario string 50 function func(parquet.Node) parquet.Node 51 }{ 52 {scenario: "optional", function: parquet.Optional}, 53 {scenario: "repeated", function: parquet.Repeated}, 54 {scenario: "required", function: parquet.Required}, 55 } { 56 t.Run(mod.scenario, func(t *testing.T) { 57 for _, ordering := range [...]struct { 58 scenario string 59 sorting parquet.SortingColumn 60 sortFunc func(parquet.Type, []parquet.Value) 61 }{ 62 {scenario: "unordered", sorting: nil, sortFunc: unordered}, 63 {scenario: "ascending", sorting: parquet.Ascending("data"), sortFunc: ascending}, 64 {scenario: "descending", sorting: parquet.Descending("data"), sortFunc: descending}, 65 } { 66 t.Run(ordering.scenario, func(t *testing.T) { 67 schema := parquet.NewSchema("test", parquet.Group{ 68 "data": mod.function(parquet.Leaf(test.typ)), 69 }) 70 71 options := []parquet.RowGroupOption{ 72 schema, 73 } 74 75 if ordering.sorting != nil { 76 options = append(options, 77 parquet.SortingRowGroupConfig( 78 parquet.SortingColumns(ordering.sorting), 79 ), 80 ) 81 } 82 83 content := new(bytes.Buffer) 84 buffer := parquet.NewRowBuffer[any](options...) 85 86 for _, values := range test.values { 87 t.Run("", func(t *testing.T) { 88 defer content.Reset() 89 defer buffer.Reset() 90 fields := schema.Fields() 91 testRowBufferAny(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc) 92 }) 93 } 94 }) 95 } 96 }) 97 } 98 }) 99 } 100 } 101 102 func testRowBuffer[Row any](t *testing.T) { 103 var model Row 104 t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) { 105 err := quickCheck(func(rows []Row) bool { 106 if len(rows) == 0 { 107 return true // TODO: fix support for parquet files with zero rows 108 } 109 if err := testRowBufferRows(rows); err != nil { 110 t.Error(err) 111 return false 112 } 113 return true 114 }) 115 if err != nil { 116 t.Error(err) 117 } 118 }) 119 } 120 121 func testRowBufferRows[Row any](rows []Row) error { 122 setNullPointers(rows) 123 buffer := parquet.NewRowBuffer[Row]() 124 _, err := buffer.Write(rows) 125 if err != nil { 126 return err 127 } 128 reader := parquet.NewGenericRowGroupReader[Row](buffer) 129 result := make([]Row, len(rows)) 130 n, err := reader.Read(result) 131 if err != nil && !errors.Is(err, io.EOF) { 132 return err 133 } 134 if n < len(rows) { 135 return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n) 136 } 137 if !reflect.DeepEqual(rows, result) { 138 return fmt.Errorf("rows mismatch:\nwant: %#v\ngot: %#v", rows, result) 139 } 140 return nil 141 } 142 143 func testRowBufferAny(t *testing.T, node parquet.Node, buffer *parquet.RowBuffer[any], encoding encoding.Encoding, values []any, sortFunc sortFunc) { 144 repetitionLevel := 0 145 definitionLevel := 0 146 if !node.Required() { 147 definitionLevel = 1 148 } 149 150 minValue := parquet.Value{} 151 maxValue := parquet.Value{} 152 batch := make([]parquet.Value, len(values)) 153 for i := range values { 154 batch[i] = parquet.ValueOf(values[i]).Level(repetitionLevel, definitionLevel, 0) 155 } 156 157 for i := range batch { 158 _, err := buffer.WriteRows([]parquet.Row{batch[i : i+1]}) 159 if err != nil { 160 t.Fatalf("writing value to row group: %v", err) 161 } 162 } 163 164 numRows := buffer.NumRows() 165 if numRows != int64(len(batch)) { 166 t.Fatalf("number of rows mismatch: want=%d got=%d", len(batch), numRows) 167 } 168 169 typ := node.Type() 170 for _, value := range batch { 171 if minValue.IsNull() || typ.Compare(value, minValue) < 0 { 172 minValue = value 173 } 174 if maxValue.IsNull() || typ.Compare(value, maxValue) > 0 { 175 maxValue = value 176 } 177 } 178 179 sortFunc(typ, batch) 180 sort.Sort(buffer) 181 182 pages := buffer.ColumnChunks()[0].Pages() 183 page, err := pages.ReadPage() 184 defer pages.Close() 185 186 if err == io.EOF { 187 if numRows != 0 { 188 t.Fatalf("no pages found in row buffer despite having %d rows", numRows) 189 } else { 190 return 191 } 192 } 193 194 numValues := page.NumValues() 195 if numValues != int64(len(batch)) { 196 t.Fatalf("number of values mistmatch: want=%d got=%d", len(batch), numValues) 197 } 198 199 numNulls := page.NumNulls() 200 if numNulls != 0 { 201 t.Fatalf("number of nulls mismatch: want=0 got=%d", numNulls) 202 } 203 204 min, max, hasBounds := page.Bounds() 205 if !hasBounds && numRows > 0 { 206 t.Fatal("page bounds are missing") 207 } 208 if !parquet.Equal(min, minValue) { 209 t.Fatalf("min value mismatch: want=%v got=%v", minValue, min) 210 } 211 if !parquet.Equal(max, maxValue) { 212 t.Fatalf("max value mismatch: want=%v got=%v", maxValue, max) 213 } 214 215 // We write a single value per row, so num values = num rows for all pages 216 // including repeated ones, which makes it OK to slice the pages using the 217 // number of values as a proxy for the row indexes. 218 halfValues := numValues / 2 219 220 for _, test := range [...]struct { 221 scenario string 222 values []parquet.Value 223 reader parquet.ValueReader 224 }{ 225 {"page", batch, page.Values()}, 226 {"head", batch[:halfValues], page.Slice(0, halfValues).Values()}, 227 {"tail", batch[halfValues:], page.Slice(halfValues, numValues).Values()}, 228 } { 229 v := [1]parquet.Value{} 230 i := 0 231 232 for { 233 n, err := test.reader.ReadValues(v[:]) 234 if n > 0 { 235 if n != 1 { 236 t.Fatalf("reading value from %q reader returned the wrong count: want=1 got=%d", test.scenario, n) 237 } 238 if i < len(test.values) { 239 if !parquet.Equal(v[0], test.values[i]) { 240 t.Fatalf("%q value at index %d mismatches: want=%v got=%v", test.scenario, i, test.values[i], v[0]) 241 } 242 } 243 i++ 244 } 245 if err != nil { 246 if err == io.EOF { 247 break 248 } 249 t.Fatalf("reading value from %q reader: %v", test.scenario, err) 250 } 251 } 252 253 if i != len(test.values) { 254 t.Errorf("wrong number of values read from %q reader: want=%d got=%d", test.scenario, len(test.values), i) 255 } 256 } 257 } 258 259 func BenchmarkSortRowBuffer(b *testing.B) { 260 type Row struct { 261 I0 int64 262 I1 int64 263 I2 int64 264 I3 int64 265 I4 int64 266 I5 int64 267 I6 int64 268 I7 int64 269 I8 int64 270 I9 int64 271 ID [16]byte 272 } 273 274 buf := parquet.NewRowBuffer[Row]( 275 parquet.SortingRowGroupConfig( 276 parquet.SortingColumns( 277 parquet.Ascending("ID"), 278 ), 279 ), 280 ) 281 282 rows := make([]Row, 10e3) 283 prng := rand.New(rand.NewSource(0)) 284 285 for i := range rows { 286 binary.LittleEndian.PutUint64(rows[i].ID[:8], uint64(i)) 287 binary.LittleEndian.PutUint64(rows[i].ID[8:], ^uint64(i)) 288 } 289 290 buf.Write(rows) 291 b.ResetTimer() 292 293 for i := 0; i < b.N; i++ { 294 for j := 0; j < 10; j++ { 295 buf.Swap(prng.Intn(len(rows)), prng.Intn(len(rows))) 296 } 297 298 sort.Sort(buf) 299 } 300 } 301 302 func BenchmarkMergeRowBuffers(b *testing.B) { 303 type Row struct { 304 ID int64 `parquet:"id"` 305 } 306 307 const ( 308 numBuffers = 100 309 numRowsPerBuffer = 10e3 310 ) 311 312 rows := [numBuffers][numRowsPerBuffer]Row{} 313 nextID := int64(0) 314 for i := 0; i < numRowsPerBuffer; i++ { 315 for j := 0; j < numBuffers; j++ { 316 rows[j][i].ID = nextID 317 nextID++ 318 } 319 } 320 321 options := []parquet.RowGroupOption{ 322 parquet.SortingRowGroupConfig( 323 parquet.SortingColumns( 324 parquet.Ascending("id"), 325 ), 326 ), 327 } 328 329 rowGroups := make([]parquet.RowGroup, numBuffers) 330 for i := range rowGroups { 331 buffer := parquet.NewRowBuffer[Row](options...) 332 buffer.Write(rows[i][:]) 333 rowGroups[i] = buffer 334 } 335 336 merge, err := parquet.MergeRowGroups(rowGroups, options...) 337 if err != nil { 338 b.Fatal(err) 339 } 340 341 b.ResetTimer() 342 343 for i := 0; i < b.N; i++ { 344 rows := merge.Rows() 345 _, err := parquet.CopyRows(discardRows{}, rows) 346 rows.Close() 347 if err != nil { 348 b.Fatal(err) 349 } 350 } 351 } 352 353 type discardRows struct{} 354 355 func (discardRows) WriteRows(rows []parquet.Row) (int, error) { 356 return len(rows), nil 357 }