github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/merge_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"math/rand"
     9  	"sort"
    10  	"testing"
    11  
    12  	"github.com/parquet-go/parquet-go"
    13  )
    14  
    15  const (
    16  	numRowGroups = 3
    17  	rowsPerGroup = benchmarkNumRows
    18  )
    19  
    20  type wrappedRowGroup struct {
    21  	parquet.RowGroup
    22  	rowsCallback func(parquet.Rows) parquet.Rows
    23  }
    24  
    25  func (r wrappedRowGroup) Rows() parquet.Rows {
    26  	return r.rowsCallback(r.RowGroup.Rows())
    27  }
    28  
    29  type wrappedRows struct {
    30  	parquet.Rows
    31  	closed bool
    32  }
    33  
    34  func (r *wrappedRows) Close() error {
    35  	r.closed = true
    36  	return r.Rows.Close()
    37  }
    38  
    39  func TestMergeRowGroups(t *testing.T) {
    40  	tests := []struct {
    41  		scenario string
    42  		options  []parquet.RowGroupOption
    43  		input    []parquet.RowGroup
    44  		output   parquet.RowGroup
    45  	}{
    46  		{
    47  			scenario: "no row groups",
    48  			options: []parquet.RowGroupOption{
    49  				parquet.SchemaOf(Person{}),
    50  			},
    51  			output: sortedRowGroup(
    52  				[]parquet.RowGroupOption{
    53  					parquet.SchemaOf(Person{}),
    54  				},
    55  			),
    56  		},
    57  
    58  		{
    59  			scenario: "a single row group",
    60  			input: []parquet.RowGroup{
    61  				sortedRowGroup(nil,
    62  					Person{FirstName: "some", LastName: "one", Age: 30},
    63  					Person{FirstName: "some", LastName: "one else", Age: 31},
    64  					Person{FirstName: "and", LastName: "you", Age: 32},
    65  				),
    66  			},
    67  			output: sortedRowGroup(nil,
    68  				Person{FirstName: "some", LastName: "one", Age: 30},
    69  				Person{FirstName: "some", LastName: "one else", Age: 31},
    70  				Person{FirstName: "and", LastName: "you", Age: 32},
    71  			),
    72  		},
    73  
    74  		{
    75  			scenario: "two row groups without ordering",
    76  			input: []parquet.RowGroup{
    77  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}),
    78  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}),
    79  			},
    80  			output: sortedRowGroup(nil,
    81  				Person{FirstName: "some", LastName: "one", Age: 30},
    82  				Person{FirstName: "some", LastName: "one else", Age: 31},
    83  			),
    84  		},
    85  
    86  		{
    87  			scenario: "three row groups without ordering",
    88  			input: []parquet.RowGroup{
    89  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}),
    90  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}),
    91  				sortedRowGroup(nil, Person{FirstName: "question", LastName: "answer", Age: 42}),
    92  			},
    93  			output: sortedRowGroup(nil,
    94  				Person{FirstName: "some", LastName: "one", Age: 30},
    95  				Person{FirstName: "some", LastName: "one else", Age: 31},
    96  				Person{FirstName: "question", LastName: "answer", Age: 42},
    97  			),
    98  		},
    99  
   100  		{
   101  			scenario: "row groups sorted by ascending last name",
   102  			options: []parquet.RowGroupOption{
   103  				parquet.SortingRowGroupConfig(
   104  					parquet.SortingColumns(
   105  						parquet.Ascending("LastName"),
   106  					),
   107  				),
   108  			},
   109  			input: []parquet.RowGroup{
   110  				sortedRowGroup(
   111  					[]parquet.RowGroupOption{
   112  						parquet.SortingRowGroupConfig(
   113  							parquet.SortingColumns(
   114  								parquet.Ascending("LastName"),
   115  							),
   116  						),
   117  					},
   118  					Person{FirstName: "Han", LastName: "Solo"},
   119  					Person{FirstName: "Luke", LastName: "Skywalker"},
   120  				),
   121  				sortedRowGroup(
   122  					[]parquet.RowGroupOption{
   123  						parquet.SortingRowGroupConfig(
   124  							parquet.SortingColumns(
   125  								parquet.Ascending("LastName"),
   126  							),
   127  						),
   128  					},
   129  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   130  				),
   131  			},
   132  			output: sortedRowGroup(nil,
   133  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   134  				Person{FirstName: "Luke", LastName: "Skywalker"},
   135  				Person{FirstName: "Han", LastName: "Solo"},
   136  			),
   137  		},
   138  		{
   139  			scenario: "reproduce issue #66, merging rows with an empty row group",
   140  			options: []parquet.RowGroupOption{
   141  				parquet.SortingRowGroupConfig(
   142  					parquet.SortingColumns(
   143  						parquet.Ascending("LastName"),
   144  					),
   145  				),
   146  			},
   147  			input: []parquet.RowGroup{
   148  				sortedRowGroup(
   149  					[]parquet.RowGroupOption{
   150  						parquet.SortingRowGroupConfig(
   151  							parquet.SortingColumns(
   152  								parquet.Ascending("LastName"),
   153  							),
   154  						),
   155  					},
   156  					Person{FirstName: "Han", LastName: "Solo"},
   157  				),
   158  
   159  				sortedRowGroup(
   160  					[]parquet.RowGroupOption{
   161  						parquet.SchemaOf(Person{}),
   162  						parquet.SortingRowGroupConfig(
   163  							parquet.SortingColumns(
   164  								parquet.Ascending("LastName"),
   165  							),
   166  						),
   167  					},
   168  				),
   169  				sortedRowGroup(
   170  					[]parquet.RowGroupOption{
   171  						parquet.SortingRowGroupConfig(
   172  							parquet.SortingColumns(
   173  								parquet.Ascending("LastName"),
   174  							),
   175  						),
   176  					},
   177  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   178  				),
   179  			},
   180  			output: sortedRowGroup(nil,
   181  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   182  				Person{FirstName: "Han", LastName: "Solo"},
   183  			),
   184  		},
   185  		{
   186  			scenario: "row groups sorted by descending last name",
   187  			options: []parquet.RowGroupOption{
   188  				parquet.SortingRowGroupConfig(
   189  					parquet.SortingColumns(
   190  						parquet.Descending("LastName"),
   191  					),
   192  				),
   193  			},
   194  			input: []parquet.RowGroup{
   195  				sortedRowGroup(
   196  					[]parquet.RowGroupOption{
   197  						parquet.SortingRowGroupConfig(
   198  							parquet.SortingColumns(
   199  								parquet.Descending("LastName"),
   200  							),
   201  						),
   202  					},
   203  					Person{FirstName: "Han", LastName: "Solo"},
   204  					Person{FirstName: "Luke", LastName: "Skywalker"},
   205  				),
   206  				sortedRowGroup(
   207  					[]parquet.RowGroupOption{
   208  						parquet.SortingRowGroupConfig(
   209  							parquet.SortingColumns(
   210  								parquet.Descending("LastName"),
   211  							),
   212  						),
   213  					},
   214  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   215  				),
   216  			},
   217  			output: sortedRowGroup(nil,
   218  				Person{FirstName: "Han", LastName: "Solo"},
   219  				Person{FirstName: "Luke", LastName: "Skywalker"},
   220  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   221  			),
   222  		},
   223  
   224  		{
   225  			scenario: "row groups sorted by ascending last and first name",
   226  			options: []parquet.RowGroupOption{
   227  				parquet.SortingRowGroupConfig(
   228  					parquet.SortingColumns(
   229  						parquet.Ascending("LastName"),
   230  						parquet.Ascending("FirstName"),
   231  					),
   232  				),
   233  			},
   234  			input: []parquet.RowGroup{
   235  				sortedRowGroup(
   236  					[]parquet.RowGroupOption{
   237  						parquet.SortingRowGroupConfig(
   238  							parquet.SortingColumns(
   239  								parquet.Ascending("LastName"),
   240  								parquet.Ascending("FirstName"),
   241  							),
   242  						),
   243  					},
   244  					Person{FirstName: "Luke", LastName: "Skywalker"},
   245  					Person{FirstName: "Han", LastName: "Solo"},
   246  				),
   247  				sortedRowGroup(
   248  					[]parquet.RowGroupOption{
   249  						parquet.SortingRowGroupConfig(
   250  							parquet.SortingColumns(
   251  								parquet.Ascending("LastName"),
   252  								parquet.Ascending("FirstName"),
   253  							),
   254  						),
   255  					},
   256  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   257  					Person{FirstName: "Anakin", LastName: "Skywalker"},
   258  				),
   259  			},
   260  			output: sortedRowGroup(nil,
   261  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   262  				Person{FirstName: "Anakin", LastName: "Skywalker"},
   263  				Person{FirstName: "Luke", LastName: "Skywalker"},
   264  				Person{FirstName: "Han", LastName: "Solo"},
   265  			),
   266  		},
   267  
   268  		{
   269  			scenario: "row groups with conversion to a different schema",
   270  			options: []parquet.RowGroupOption{
   271  				parquet.SchemaOf(LastNameOnly{}),
   272  				parquet.SortingRowGroupConfig(
   273  					parquet.SortingColumns(
   274  						parquet.Ascending("LastName"),
   275  					),
   276  				),
   277  			},
   278  			input: []parquet.RowGroup{
   279  				sortedRowGroup(
   280  					[]parquet.RowGroupOption{
   281  						parquet.SortingRowGroupConfig(
   282  							parquet.SortingColumns(
   283  								parquet.Ascending("LastName"),
   284  							),
   285  						),
   286  					},
   287  					Person{FirstName: "Han", LastName: "Solo"},
   288  					Person{FirstName: "Luke", LastName: "Skywalker"},
   289  				),
   290  				sortedRowGroup(
   291  					[]parquet.RowGroupOption{
   292  						parquet.SortingRowGroupConfig(
   293  							parquet.SortingColumns(
   294  								parquet.Ascending("LastName"),
   295  							),
   296  						),
   297  					},
   298  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   299  					Person{FirstName: "Anakin", LastName: "Skywalker"},
   300  				),
   301  			},
   302  			output: sortedRowGroup(
   303  				[]parquet.RowGroupOption{
   304  					parquet.SortingRowGroupConfig(
   305  						parquet.SortingColumns(
   306  							parquet.Ascending("LastName"),
   307  						),
   308  					),
   309  				},
   310  				LastNameOnly{LastName: "Solo"},
   311  				LastNameOnly{LastName: "Skywalker"},
   312  				LastNameOnly{LastName: "Skywalker"},
   313  				LastNameOnly{LastName: "Kenobi"},
   314  			),
   315  		},
   316  	}
   317  
   318  	for _, adapter := range []struct {
   319  		scenario string
   320  		function func(parquet.RowGroup) parquet.RowGroup
   321  	}{
   322  		{scenario: "buffer", function: selfRowGroup},
   323  		{scenario: "file", function: fileRowGroup},
   324  	} {
   325  		t.Run(adapter.scenario, func(t *testing.T) {
   326  			for _, test := range tests {
   327  				t.Run(test.scenario, func(t *testing.T) {
   328  					input := make([]parquet.RowGroup, len(test.input))
   329  					for i := range test.input {
   330  						input[i] = adapter.function(test.input[i])
   331  					}
   332  
   333  					merged, err := parquet.MergeRowGroups(test.input, test.options...)
   334  					if err != nil {
   335  						t.Fatal(err)
   336  					}
   337  					if merged.NumRows() != test.output.NumRows() {
   338  						t.Fatalf("the number of rows mismatch: want=%d got=%d", merged.NumRows(), test.output.NumRows())
   339  					}
   340  					if merged.Schema() != test.output.Schema() {
   341  						t.Fatalf("the row group schemas mismatch:\n%v\n%v", test.output.Schema(), merged.Schema())
   342  					}
   343  
   344  					options := []parquet.RowGroupOption{parquet.SchemaOf(Person{})}
   345  					options = append(options, test.options...)
   346  					// We test two views of the resulting row group: the one originally
   347  					// returned by MergeRowGroups, and one where the merged row group
   348  					// has been copied into a new buffer. The intent is to exercise both
   349  					// the row-by-row read as well as optimized code paths when CopyRows
   350  					// bypasses the ReadRow/WriteRow calls and the row group is written
   351  					// directly to the buffer by calling WriteRowsTo/WriteRowGroup.
   352  					mergedCopy := parquet.NewBuffer(options...)
   353  
   354  					totalRows := test.output.NumRows()
   355  					numRows, err := copyRowsAndClose(mergedCopy, merged.Rows())
   356  					if err != nil {
   357  						t.Fatal(err)
   358  					}
   359  					if numRows != totalRows {
   360  						t.Fatalf("wrong number of rows copied: want=%d got=%d", totalRows, numRows)
   361  					}
   362  
   363  					for _, merge := range []struct {
   364  						scenario string
   365  						rowGroup parquet.RowGroup
   366  					}{
   367  						{scenario: "self", rowGroup: merged},
   368  						{scenario: "copy", rowGroup: mergedCopy},
   369  					} {
   370  						t.Run(merge.scenario, func(t *testing.T) {
   371  							var expectedRows = test.output.Rows()
   372  							var mergedRows = merge.rowGroup.Rows()
   373  							var row1 = make([]parquet.Row, 1)
   374  							var row2 = make([]parquet.Row, 1)
   375  							var numRows int64
   376  
   377  							defer expectedRows.Close()
   378  							defer mergedRows.Close()
   379  
   380  							for {
   381  								_, err1 := expectedRows.ReadRows(row1)
   382  								n, err2 := mergedRows.ReadRows(row2)
   383  
   384  								if err1 != err2 {
   385  									// ReadRows may or may not return io.EOF
   386  									// when it reads the last row, so we test
   387  									// that the reference RowReader has also
   388  									// reached the end.
   389  									if err1 == nil && err2 == io.EOF {
   390  										_, err1 = expectedRows.ReadRows(row1[:0])
   391  									}
   392  									if err1 != io.EOF {
   393  										t.Fatalf("errors mismatched while comparing row %d/%d: want=%v got=%v", numRows, totalRows, err1, err2)
   394  									}
   395  								}
   396  
   397  								if n != 0 {
   398  									if !row1[0].Equal(row2[0]) {
   399  										t.Errorf("row at index %d/%d mismatch: want=%+v got=%+v", numRows, totalRows, row1[0], row2[0])
   400  									}
   401  									numRows++
   402  								}
   403  
   404  								if err1 != nil {
   405  									break
   406  								}
   407  							}
   408  
   409  							if numRows != totalRows {
   410  								t.Errorf("expected to read %d rows but %d were found", totalRows, numRows)
   411  							}
   412  						})
   413  					}
   414  
   415  				})
   416  			}
   417  		})
   418  	}
   419  }
   420  
   421  func TestMergeRowGroupsCursorsAreClosed(t *testing.T) {
   422  	type model struct {
   423  		A int
   424  	}
   425  
   426  	schema := parquet.SchemaOf(model{})
   427  	options := []parquet.RowGroupOption{
   428  		parquet.SortingRowGroupConfig(
   429  			parquet.SortingColumns(
   430  				parquet.Ascending(schema.Columns()[0]...),
   431  			),
   432  		),
   433  	}
   434  
   435  	prng := rand.New(rand.NewSource(0))
   436  	rowGroups := make([]parquet.RowGroup, numRowGroups)
   437  	rows := make([]*wrappedRows, 0, numRowGroups)
   438  
   439  	for i := range rowGroups {
   440  		rowGroups[i] = wrappedRowGroup{
   441  			RowGroup: sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, model{})...),
   442  			rowsCallback: func(r parquet.Rows) parquet.Rows {
   443  				wrapped := &wrappedRows{Rows: r}
   444  				rows = append(rows, wrapped)
   445  				return wrapped
   446  			},
   447  		}
   448  	}
   449  
   450  	m, err := parquet.MergeRowGroups(rowGroups, options...)
   451  	if err != nil {
   452  		t.Fatal(err)
   453  	}
   454  	func() {
   455  		mergedRows := m.Rows()
   456  		defer mergedRows.Close()
   457  
   458  		// Add 1 more slot to the buffer to force an io.EOF on the first read.
   459  		rbuf := make([]parquet.Row, (numRowGroups*rowsPerGroup)+1)
   460  		if _, err := mergedRows.ReadRows(rbuf); !errors.Is(err, io.EOF) {
   461  			t.Fatal(err)
   462  		}
   463  	}()
   464  
   465  	for i, wrapped := range rows {
   466  		if !wrapped.closed {
   467  			t.Fatalf("RowGroup %d not closed", i)
   468  		}
   469  	}
   470  }
   471  
   472  func TestMergeRowGroupsSeekToRow(t *testing.T) {
   473  	type model struct {
   474  		A int
   475  	}
   476  
   477  	schema := parquet.SchemaOf(model{})
   478  	options := []parquet.RowGroupOption{
   479  		parquet.SortingRowGroupConfig(
   480  			parquet.SortingColumns(
   481  				parquet.Ascending(schema.Columns()[0]...),
   482  			),
   483  		),
   484  	}
   485  
   486  	rowGroups := make([]parquet.RowGroup, numRowGroups)
   487  
   488  	counter := 0
   489  	for i := range rowGroups {
   490  		rows := make([]interface{}, 0, rowsPerGroup)
   491  		for j := 0; j < rowsPerGroup; j++ {
   492  			rows = append(rows, model{A: counter})
   493  			counter++
   494  		}
   495  		rowGroups[i] = sortedRowGroup(options, rows...)
   496  	}
   497  
   498  	m, err := parquet.MergeRowGroups(rowGroups, options...)
   499  	if err != nil {
   500  		t.Fatal(err)
   501  	}
   502  
   503  	func() {
   504  		mergedRows := m.Rows()
   505  		defer mergedRows.Close()
   506  
   507  		rbuf := make([]parquet.Row, 1)
   508  		cursor := int64(0)
   509  		for {
   510  			if err := mergedRows.SeekToRow(cursor); err != nil {
   511  				t.Fatal(err)
   512  			}
   513  
   514  			if _, err := mergedRows.ReadRows(rbuf); err != nil {
   515  				if errors.Is(err, io.EOF) {
   516  					break
   517  				}
   518  				t.Fatal(err)
   519  			}
   520  			v := model{}
   521  			if err := schema.Reconstruct(&v, rbuf[0]); err != nil {
   522  				t.Fatal(err)
   523  			}
   524  			if v.A != int(cursor) {
   525  				t.Fatalf("expected value %d, got %d", cursor, v.A)
   526  			}
   527  
   528  			cursor++
   529  		}
   530  	}()
   531  }
   532  
   533  func BenchmarkMergeRowGroups(b *testing.B) {
   534  	for _, test := range readerTests {
   535  		b.Run(test.scenario, func(b *testing.B) {
   536  			schema := parquet.SchemaOf(test.model)
   537  
   538  			options := []parquet.RowGroupOption{
   539  				parquet.SortingRowGroupConfig(
   540  					parquet.SortingColumns(
   541  						parquet.Ascending(schema.Columns()[0]...),
   542  					),
   543  				),
   544  			}
   545  
   546  			prng := rand.New(rand.NewSource(0))
   547  			rowGroups := make([]parquet.RowGroup, numRowGroups)
   548  
   549  			for i := range rowGroups {
   550  				rowGroups[i] = sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, test.model)...)
   551  			}
   552  
   553  			for n := 1; n <= numRowGroups; n++ {
   554  				b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) {
   555  					mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...)
   556  					if err != nil {
   557  						b.Fatal(err)
   558  					}
   559  
   560  					rows := mergedRowGroup.Rows()
   561  					rbuf := make([]parquet.Row, benchmarkRowsPerStep)
   562  					defer func() { rows.Close() }()
   563  
   564  					benchmarkRowsPerSecond(b, func() int {
   565  						n, err := rows.ReadRows(rbuf)
   566  						if err != nil {
   567  							if !errors.Is(err, io.EOF) {
   568  								b.Fatal(err)
   569  							}
   570  							rows.Close()
   571  							rows = mergedRowGroup.Rows()
   572  						}
   573  						return n
   574  					})
   575  				})
   576  			}
   577  		})
   578  	}
   579  }
   580  
   581  func BenchmarkMergeFiles(b *testing.B) {
   582  	rowGroupBuffers := make([]bytes.Buffer, numRowGroups)
   583  
   584  	for _, test := range readerTests {
   585  		b.Run(test.scenario, func(b *testing.B) {
   586  			schema := parquet.SchemaOf(test.model)
   587  
   588  			sortingOptions := []parquet.SortingOption{
   589  				parquet.SortingColumns(
   590  					parquet.Ascending(schema.Columns()[0]...),
   591  				),
   592  			}
   593  
   594  			options := []parquet.RowGroupOption{
   595  				schema,
   596  				parquet.SortingRowGroupConfig(
   597  					sortingOptions...,
   598  				),
   599  			}
   600  
   601  			buffer := parquet.NewBuffer(options...)
   602  
   603  			prng := rand.New(rand.NewSource(0))
   604  			files := make([]*parquet.File, numRowGroups)
   605  			rowGroups := make([]parquet.RowGroup, numRowGroups)
   606  
   607  			for i := range files {
   608  				for _, row := range randomRowsOf(prng, rowsPerGroup, test.model) {
   609  					buffer.Write(row)
   610  				}
   611  				sort.Sort(buffer)
   612  				rowGroupBuffers[i].Reset()
   613  				writer := parquet.NewWriter(&rowGroupBuffers[i],
   614  					schema,
   615  					parquet.SortingWriterConfig(
   616  						sortingOptions...,
   617  					),
   618  				)
   619  				_, err := copyRowsAndClose(writer, buffer.Rows())
   620  				if err != nil {
   621  					b.Fatal(err)
   622  				}
   623  				if err := writer.Close(); err != nil {
   624  					b.Fatal(err)
   625  				}
   626  				r := bytes.NewReader(rowGroupBuffers[i].Bytes())
   627  				f, err := parquet.OpenFile(r, r.Size())
   628  				if err != nil {
   629  					b.Fatal(err)
   630  				}
   631  				files[i], rowGroups[i] = f, f.RowGroups()[0]
   632  			}
   633  
   634  			for n := 1; n <= numRowGroups; n++ {
   635  				b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) {
   636  					mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...)
   637  					if err != nil {
   638  						b.Fatal(err)
   639  					}
   640  
   641  					rows := mergedRowGroup.Rows()
   642  					rbuf := make([]parquet.Row, benchmarkRowsPerStep)
   643  					defer func() { rows.Close() }()
   644  
   645  					benchmarkRowsPerSecond(b, func() int {
   646  						n, err := rows.ReadRows(rbuf)
   647  						if err != nil {
   648  							if !errors.Is(err, io.EOF) {
   649  								b.Fatal(err)
   650  							}
   651  							rows.Close()
   652  							rows = mergedRowGroup.Rows()
   653  						}
   654  						return n
   655  					})
   656  
   657  					totalSize := int64(0)
   658  					for _, f := range files[:n] {
   659  						totalSize += f.Size()
   660  					}
   661  				})
   662  			}
   663  		})
   664  	}
   665  }