github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/row_buffer_test.go

//go:build go1.18

package parquet_test

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"math/rand"
	"reflect"
	"sort"
	"testing"

	"github.com/segmentio/parquet-go"
	"github.com/segmentio/parquet-go/encoding"
)

func TestRowBuffer(t *testing.T) {
	testRowBuffer[booleanColumn](t)
	testRowBuffer[int32Column](t)
	testRowBuffer[int64Column](t)
	testRowBuffer[int96Column](t)
	testRowBuffer[floatColumn](t)
	testRowBuffer[doubleColumn](t)
	testRowBuffer[byteArrayColumn](t)
	testRowBuffer[fixedLenByteArrayColumn](t)
	testRowBuffer[stringColumn](t)
	testRowBuffer[indexedStringColumn](t)
	testRowBuffer[uuidColumn](t)
	testRowBuffer[timeColumn](t)
	testRowBuffer[timeInMillisColumn](t)
	testRowBuffer[mapColumn](t)
	testRowBuffer[decimalColumn](t)
	testRowBuffer[addressBook](t)
	testRowBuffer[contact](t)
	testRowBuffer[listColumn2](t)
	testRowBuffer[listColumn1](t)
	testRowBuffer[listColumn0](t)
	testRowBuffer[nestedListColumn1](t)
	testRowBuffer[nestedListColumn](t)
	testRowBuffer[*contact](t)
	testRowBuffer[paddedBooleanColumn](t)
	testRowBuffer[optionalInt32Column](t)
	testRowBuffer[repeatedInt32Column](t)

	for _, test := range bufferTests {
		t.Run(test.scenario, func(t *testing.T) {
			for _, mod := range [...]struct {
				scenario string
				function func(parquet.Node) parquet.Node
			}{
				{scenario: "optional", function: parquet.Optional},
				{scenario: "repeated", function: parquet.Repeated},
				{scenario: "required", function: parquet.Required},
			} {
				t.Run(mod.scenario, func(t *testing.T) {
					for _, ordering := range [...]struct {
						scenario string
						sorting  parquet.SortingColumn
						sortFunc func(parquet.Type, []parquet.Value)
					}{
						{scenario: "unordered", sorting: nil, sortFunc: unordered},
						{scenario: "ascending", sorting: parquet.Ascending("data"), sortFunc: ascending},
						{scenario: "descending", sorting: parquet.Descending("data"), sortFunc: descending},
					} {
						t.Run(ordering.scenario, func(t *testing.T) {
							schema := parquet.NewSchema("test", parquet.Group{
								"data": mod.function(parquet.Leaf(test.typ)),
							})

							options := []parquet.RowGroupOption{
								schema,
							}

							if ordering.sorting != nil {
								options = append(options,
									parquet.SortingRowGroupConfig(
										parquet.SortingColumns(ordering.sorting),
									),
								)
							}

							content := new(bytes.Buffer)
							buffer := parquet.NewRowBuffer[any](options...)

							for _, values := range test.values {
								t.Run("", func(t *testing.T) {
									defer content.Reset()
									defer buffer.Reset()
									fields := schema.Fields()
									testRowBufferAny(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc)
								})
							}
						})
					}
				})
			}
		})
	}
}

func testRowBuffer[Row any](t *testing.T) {
	var model Row
	t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) {
		err := quickCheck(func(rows []Row) bool {
			if len(rows) == 0 {
				return true // TODO: fix support for parquet files with zero rows
			}
			if err := testRowBufferRows(rows); err != nil {
				t.Error(err)
				return false
			}
			return true
		})
		if err != nil {
			t.Error(err)
		}
	})
}

func testRowBufferRows[Row any](rows []Row) error {
	setNullPointers(rows)
	buffer := parquet.NewRowBuffer[Row]()
	_, err := buffer.Write(rows)
	if err != nil {
		return err
	}
	reader := parquet.NewGenericRowGroupReader[Row](buffer)
	result := make([]Row, len(rows))
	n, err := reader.Read(result)
	if err != nil && !errors.Is(err, io.EOF) {
		return err
	}
	if n < len(rows) {
		return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n)
	}
	if !reflect.DeepEqual(rows, result) {
		return fmt.Errorf("rows mismatch:\nwant: %#v\ngot: %#v", rows, result)
	}
	return nil
}

func testRowBufferAny(t *testing.T, node parquet.Node, buffer *parquet.RowBuffer[any], encoding encoding.Encoding, values []any, sortFunc sortFunc) {
	repetitionLevel := 0
	definitionLevel := 0
	if !node.Required() {
		definitionLevel = 1
	}

	minValue := parquet.Value{}
	maxValue := parquet.Value{}
	batch := make([]parquet.Value, len(values))
	for i := range values {
		batch[i] = parquet.ValueOf(values[i]).Level(repetitionLevel, definitionLevel, 0)
	}

	for i := range batch {
		_, err := buffer.WriteRows([]parquet.Row{batch[i : i+1]})
		if err != nil {
			t.Fatalf("writing value to row group: %v", err)
		}
	}

	numRows := buffer.NumRows()
	if numRows != int64(len(batch)) {
		t.Fatalf("number of rows mismatch: want=%d got=%d", len(batch), numRows)
	}

	typ := node.Type()
	for _, value := range batch {
		if minValue.IsNull() || typ.Compare(value, minValue) < 0 {
			minValue = value
		}
		if maxValue.IsNull() || typ.Compare(value, maxValue) > 0 {
			maxValue = value
		}
	}

	sortFunc(typ, batch)
	sort.Sort(buffer)

	pages := buffer.ColumnChunks()[0].Pages()
	defer pages.Close()

	page, err := pages.ReadPage()
	if err == io.EOF {
		if numRows != 0 {
			t.Fatalf("no pages found in row buffer despite having %d rows", numRows)
		}
		return
	}
	if err != nil {
		// A non-EOF error leaves page unusable; fail instead of dereferencing it below.
		t.Fatalf("reading page from row buffer: %v", err)
	}

	numValues := page.NumValues()
	if numValues != int64(len(batch)) {
		t.Fatalf("number of values mismatch: want=%d got=%d", len(batch), numValues)
	}

	numNulls := page.NumNulls()
	if numNulls != 0 {
		t.Fatalf("number of nulls mismatch: want=0 got=%d", numNulls)
	}

	min, max, hasBounds := page.Bounds()
	if !hasBounds && numRows > 0 {
		t.Fatal("page bounds are missing")
	}
	if !parquet.Equal(min, minValue) {
		t.Fatalf("min value mismatch: want=%v got=%v", minValue, min)
	}
	if !parquet.Equal(max, maxValue) {
		t.Fatalf("max value mismatch: want=%v got=%v", maxValue, max)
	}
	// We write a single value per row, so num values = num rows for all pages
	// including repeated ones, which makes it OK to slice the pages using the
	// number of values as a proxy for the row indexes.
	halfValues := numValues / 2

	for _, test := range [...]struct {
		scenario string
		values   []parquet.Value
		reader   parquet.ValueReader
	}{
		{"page", batch, page.Values()},
		{"head", batch[:halfValues], page.Slice(0, halfValues).Values()},
		{"tail", batch[halfValues:], page.Slice(halfValues, numValues).Values()},
	} {
		v := [1]parquet.Value{}
		i := 0

		for {
			n, err := test.reader.ReadValues(v[:])
			if n > 0 {
				if n != 1 {
					t.Fatalf("reading value from %q reader returned the wrong count: want=1 got=%d", test.scenario, n)
				}
				if i < len(test.values) {
					if !parquet.Equal(v[0], test.values[i]) {
						t.Fatalf("%q value at index %d mismatches: want=%v got=%v", test.scenario, i, test.values[i], v[0])
					}
				}
				i++
			}
			if err != nil {
				if err == io.EOF {
					break
				}
				t.Fatalf("reading value from %q reader: %v", test.scenario, err)
			}
		}

		if i != len(test.values) {
			t.Errorf("wrong number of values read from %q reader: want=%d got=%d", test.scenario, len(test.values), i)
		}
	}
}

func BenchmarkSortRowBuffer(b *testing.B) {
	type Row struct {
		I0 int64
		I1 int64
		I2 int64
		I3 int64
		I4 int64
		I5 int64
		I6 int64
		I7 int64
		I8 int64
		I9 int64
		ID [16]byte
	}

	buf := parquet.NewRowBuffer[Row](
		parquet.SortingRowGroupConfig(
			parquet.SortingColumns(
				parquet.Ascending("ID"),
			),
		),
	)

	rows := make([]Row, 10e3)
	prng := rand.New(rand.NewSource(0))

	for i := range rows {
		binary.LittleEndian.PutUint64(rows[i].ID[:8], uint64(i))
		binary.LittleEndian.PutUint64(rows[i].ID[8:], ^uint64(i))
	}

	buf.Write(rows)
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		for j := 0; j < 10; j++ {
			buf.Swap(prng.Intn(len(rows)), prng.Intn(len(rows)))
		}

		sort.Sort(buf)
	}
}

func BenchmarkMergeRowBuffers(b *testing.B) {
	type Row struct {
		ID int64 `parquet:"id"`
	}

	const (
		numBuffers       = 100
		numRowsPerBuffer = 10e3
	)

	rows := [numBuffers][numRowsPerBuffer]Row{}
	nextID := int64(0)
	for i := 0; i < numRowsPerBuffer; i++ {
		for j := 0; j < numBuffers; j++ {
			rows[j][i].ID = nextID
			nextID++
		}
	}

	options := []parquet.RowGroupOption{
		parquet.SortingRowGroupConfig(
			parquet.SortingColumns(
				parquet.Ascending("id"),
			),
		),
	}

	rowGroups := make([]parquet.RowGroup, numBuffers)
	for i := range rowGroups {
		buffer := parquet.NewRowBuffer[Row](options...)
		buffer.Write(rows[i][:])
		rowGroups[i] = buffer
	}

	merge, err := parquet.MergeRowGroups(rowGroups, options...)
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		rows := merge.Rows()
		_, err := parquet.CopyRows(discardRows{}, rows)
		rows.Close()
		if err != nil {
			b.Fatal(err)
		}
	}
}

type discardRows struct{}

func (discardRows) WriteRows(rows []parquet.Row) (int, error) {
	return len(rows), nil
}
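
// The example below is an illustrative sketch, not part of the original test
// suite: it condenses the round trip the tests above exercise (write rows to
// a RowBuffer, sort them by the configured sorting columns, read them back
// through a generic row group reader). The Row type and its "id" column are
// hypothetical stand-ins for whatever schema a caller would use.
func ExampleNewRowBuffer() {
	type Row struct {
		ID int64 `parquet:"id"`
	}

	// Configure the buffer so that sort.Sort orders rows by the "id" column.
	buffer := parquet.NewRowBuffer[Row](
		parquet.SortingRowGroupConfig(
			parquet.SortingColumns(parquet.Ascending("id")),
		),
	)

	// Rows may be written in any order; sorting happens explicitly.
	buffer.Write([]Row{{ID: 3}, {ID: 1}, {ID: 2}})
	sort.Sort(buffer)

	reader := parquet.NewGenericRowGroupReader[Row](buffer)
	result := make([]Row, 3)
	n, _ := reader.Read(result) // may return io.EOF along with the rows read
	fmt.Println(result[:n])
	// Output: [{1} {2} {3}]
}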