github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/merge_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"math/rand"
     9  	"sort"
    10  	"testing"
    11  
    12  	"github.com/segmentio/parquet-go"
    13  )
    14  
    15  const (
    16  	numRowGroups = 3
    17  	rowsPerGroup = benchmarkNumRows
    18  )
    19  
    20  type wrappedRowGroup struct {
    21  	parquet.RowGroup
    22  	rowsCallback func(parquet.Rows) parquet.Rows
    23  }
    24  
    25  func (r wrappedRowGroup) Rows() parquet.Rows {
    26  	return r.rowsCallback(r.RowGroup.Rows())
    27  }
    28  
    29  type wrappedRows struct {
    30  	parquet.Rows
    31  	closed bool
    32  }
    33  
    34  func (r *wrappedRows) Close() error {
    35  	r.closed = true
    36  	return r.Rows.Close()
    37  }
    38  
    39  func TestMergeRowGroups(t *testing.T) {
    40  	tests := []struct {
    41  		scenario string
    42  		options  []parquet.RowGroupOption
    43  		input    []parquet.RowGroup
    44  		output   parquet.RowGroup
    45  	}{
    46  		{
    47  			scenario: "no row groups",
    48  			options: []parquet.RowGroupOption{
    49  				parquet.SchemaOf(Person{}),
    50  			},
    51  			output: sortedRowGroup(
    52  				[]parquet.RowGroupOption{
    53  					parquet.SchemaOf(Person{}),
    54  				},
    55  			),
    56  		},
    57  
    58  		{
    59  			scenario: "a single row group",
    60  			input: []parquet.RowGroup{
    61  				sortedRowGroup(nil,
    62  					Person{FirstName: "some", LastName: "one", Age: 30},
    63  					Person{FirstName: "some", LastName: "one else", Age: 31},
    64  					Person{FirstName: "and", LastName: "you", Age: 32},
    65  				),
    66  			},
    67  			output: sortedRowGroup(nil,
    68  				Person{FirstName: "some", LastName: "one", Age: 30},
    69  				Person{FirstName: "some", LastName: "one else", Age: 31},
    70  				Person{FirstName: "and", LastName: "you", Age: 32},
    71  			),
    72  		},
    73  
    74  		{
    75  			scenario: "two row groups without ordering",
    76  			input: []parquet.RowGroup{
    77  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}),
    78  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}),
    79  			},
    80  			output: sortedRowGroup(nil,
    81  				Person{FirstName: "some", LastName: "one", Age: 30},
    82  				Person{FirstName: "some", LastName: "one else", Age: 31},
    83  			),
    84  		},
    85  
    86  		{
    87  			scenario: "three row groups without ordering",
    88  			input: []parquet.RowGroup{
    89  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}),
    90  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}),
    91  				sortedRowGroup(nil, Person{FirstName: "question", LastName: "answer", Age: 42}),
    92  			},
    93  			output: sortedRowGroup(nil,
    94  				Person{FirstName: "some", LastName: "one", Age: 30},
    95  				Person{FirstName: "some", LastName: "one else", Age: 31},
    96  				Person{FirstName: "question", LastName: "answer", Age: 42},
    97  			),
    98  		},
    99  
   100  		{
   101  			scenario: "row groups sorted by ascending last name",
   102  			options: []parquet.RowGroupOption{
   103  				parquet.SortingRowGroupConfig(
   104  					parquet.SortingColumns(
   105  						parquet.Ascending("LastName"),
   106  					),
   107  				),
   108  			},
   109  			input: []parquet.RowGroup{
   110  				sortedRowGroup(
   111  					[]parquet.RowGroupOption{
   112  						parquet.SortingRowGroupConfig(
   113  							parquet.SortingColumns(
   114  								parquet.Ascending("LastName"),
   115  							),
   116  						),
   117  					},
   118  					Person{FirstName: "Han", LastName: "Solo"},
   119  					Person{FirstName: "Luke", LastName: "Skywalker"},
   120  				),
   121  				sortedRowGroup(
   122  					[]parquet.RowGroupOption{
   123  						parquet.SortingRowGroupConfig(
   124  							parquet.SortingColumns(
   125  								parquet.Ascending("LastName"),
   126  							),
   127  						),
   128  					},
   129  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   130  				),
   131  			},
   132  			output: sortedRowGroup(nil,
   133  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   134  				Person{FirstName: "Luke", LastName: "Skywalker"},
   135  				Person{FirstName: "Han", LastName: "Solo"},
   136  			),
   137  		},
   138  
   139  		{
   140  			scenario: "row groups sorted by descending last name",
   141  			options: []parquet.RowGroupOption{
   142  				parquet.SortingRowGroupConfig(
   143  					parquet.SortingColumns(
   144  						parquet.Descending("LastName"),
   145  					),
   146  				),
   147  			},
   148  			input: []parquet.RowGroup{
   149  				sortedRowGroup(
   150  					[]parquet.RowGroupOption{
   151  						parquet.SortingRowGroupConfig(
   152  							parquet.SortingColumns(
   153  								parquet.Descending("LastName"),
   154  							),
   155  						),
   156  					},
   157  					Person{FirstName: "Han", LastName: "Solo"},
   158  					Person{FirstName: "Luke", LastName: "Skywalker"},
   159  				),
   160  				sortedRowGroup(
   161  					[]parquet.RowGroupOption{
   162  						parquet.SortingRowGroupConfig(
   163  							parquet.SortingColumns(
   164  								parquet.Descending("LastName"),
   165  							),
   166  						),
   167  					},
   168  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   169  				),
   170  			},
   171  			output: sortedRowGroup(nil,
   172  				Person{FirstName: "Han", LastName: "Solo"},
   173  				Person{FirstName: "Luke", LastName: "Skywalker"},
   174  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   175  			),
   176  		},
   177  
   178  		{
   179  			scenario: "row groups sorted by ascending last and first name",
   180  			options: []parquet.RowGroupOption{
   181  				parquet.SortingRowGroupConfig(
   182  					parquet.SortingColumns(
   183  						parquet.Ascending("LastName"),
   184  						parquet.Ascending("FirstName"),
   185  					),
   186  				),
   187  			},
   188  			input: []parquet.RowGroup{
   189  				sortedRowGroup(
   190  					[]parquet.RowGroupOption{
   191  						parquet.SortingRowGroupConfig(
   192  							parquet.SortingColumns(
   193  								parquet.Ascending("LastName"),
   194  								parquet.Ascending("FirstName"),
   195  							),
   196  						),
   197  					},
   198  					Person{FirstName: "Luke", LastName: "Skywalker"},
   199  					Person{FirstName: "Han", LastName: "Solo"},
   200  				),
   201  				sortedRowGroup(
   202  					[]parquet.RowGroupOption{
   203  						parquet.SortingRowGroupConfig(
   204  							parquet.SortingColumns(
   205  								parquet.Ascending("LastName"),
   206  								parquet.Ascending("FirstName"),
   207  							),
   208  						),
   209  					},
   210  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   211  					Person{FirstName: "Anakin", LastName: "Skywalker"},
   212  				),
   213  			},
   214  			output: sortedRowGroup(nil,
   215  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   216  				Person{FirstName: "Anakin", LastName: "Skywalker"},
   217  				Person{FirstName: "Luke", LastName: "Skywalker"},
   218  				Person{FirstName: "Han", LastName: "Solo"},
   219  			),
   220  		},
   221  
   222  		{
   223  			scenario: "row groups with conversion to a different schema",
   224  			options: []parquet.RowGroupOption{
   225  				parquet.SchemaOf(LastNameOnly{}),
   226  				parquet.SortingRowGroupConfig(
   227  					parquet.SortingColumns(
   228  						parquet.Ascending("LastName"),
   229  					),
   230  				),
   231  			},
   232  			input: []parquet.RowGroup{
   233  				sortedRowGroup(
   234  					[]parquet.RowGroupOption{
   235  						parquet.SortingRowGroupConfig(
   236  							parquet.SortingColumns(
   237  								parquet.Ascending("LastName"),
   238  							),
   239  						),
   240  					},
   241  					Person{FirstName: "Han", LastName: "Solo"},
   242  					Person{FirstName: "Luke", LastName: "Skywalker"},
   243  				),
   244  				sortedRowGroup(
   245  					[]parquet.RowGroupOption{
   246  						parquet.SortingRowGroupConfig(
   247  							parquet.SortingColumns(
   248  								parquet.Ascending("LastName"),
   249  							),
   250  						),
   251  					},
   252  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   253  					Person{FirstName: "Anakin", LastName: "Skywalker"},
   254  				),
   255  			},
   256  			output: sortedRowGroup(
   257  				[]parquet.RowGroupOption{
   258  					parquet.SortingRowGroupConfig(
   259  						parquet.SortingColumns(
   260  							parquet.Ascending("LastName"),
   261  						),
   262  					),
   263  				},
   264  				LastNameOnly{LastName: "Solo"},
   265  				LastNameOnly{LastName: "Skywalker"},
   266  				LastNameOnly{LastName: "Skywalker"},
   267  				LastNameOnly{LastName: "Kenobi"},
   268  			),
   269  		},
   270  	}
   271  
   272  	for _, adapter := range []struct {
   273  		scenario string
   274  		function func(parquet.RowGroup) parquet.RowGroup
   275  	}{
   276  		{scenario: "buffer", function: selfRowGroup},
   277  		{scenario: "file", function: fileRowGroup},
   278  	} {
   279  		t.Run(adapter.scenario, func(t *testing.T) {
   280  			for _, test := range tests {
   281  				t.Run(test.scenario, func(t *testing.T) {
   282  					input := make([]parquet.RowGroup, len(test.input))
   283  					for i := range test.input {
   284  						input[i] = adapter.function(test.input[i])
   285  					}
   286  
   287  					merged, err := parquet.MergeRowGroups(test.input, test.options...)
   288  					if err != nil {
   289  						t.Fatal(err)
   290  					}
   291  					if merged.NumRows() != test.output.NumRows() {
   292  						t.Fatalf("the number of rows mismatch: want=%d got=%d", merged.NumRows(), test.output.NumRows())
   293  					}
   294  					if merged.Schema() != test.output.Schema() {
   295  						t.Fatalf("the row group schemas mismatch:\n%v\n%v", test.output.Schema(), merged.Schema())
   296  					}
   297  
   298  					options := []parquet.RowGroupOption{parquet.SchemaOf(Person{})}
   299  					options = append(options, test.options...)
   300  					// We test two views of the resulting row group: the one originally
   301  					// returned by MergeRowGroups, and one where the merged row group
   302  					// has been copied into a new buffer. The intent is to exercise both
   303  					// the row-by-row read as well as optimized code paths when CopyRows
   304  					// bypasses the ReadRow/WriteRow calls and the row group is written
   305  					// directly to the buffer by calling WriteRowsTo/WriteRowGroup.
   306  					mergedCopy := parquet.NewBuffer(options...)
   307  
   308  					totalRows := test.output.NumRows()
   309  					numRows, err := copyRowsAndClose(mergedCopy, merged.Rows())
   310  					if err != nil {
   311  						t.Fatal(err)
   312  					}
   313  					if numRows != totalRows {
   314  						t.Fatalf("wrong number of rows copied: want=%d got=%d", totalRows, numRows)
   315  					}
   316  
   317  					for _, merge := range []struct {
   318  						scenario string
   319  						rowGroup parquet.RowGroup
   320  					}{
   321  						{scenario: "self", rowGroup: merged},
   322  						{scenario: "copy", rowGroup: mergedCopy},
   323  					} {
   324  						t.Run(merge.scenario, func(t *testing.T) {
   325  							var expectedRows = test.output.Rows()
   326  							var mergedRows = merge.rowGroup.Rows()
   327  							var row1 = make([]parquet.Row, 1)
   328  							var row2 = make([]parquet.Row, 1)
   329  							var numRows int64
   330  
   331  							defer expectedRows.Close()
   332  							defer mergedRows.Close()
   333  
   334  							for {
   335  								_, err1 := expectedRows.ReadRows(row1)
   336  								n, err2 := mergedRows.ReadRows(row2)
   337  
   338  								if err1 != err2 {
   339  									// ReadRows may or may not return io.EOF
   340  									// when it reads the last row, so we test
   341  									// that the reference RowReader has also
   342  									// reached the end.
   343  									if err1 == nil && err2 == io.EOF {
   344  										_, err1 = expectedRows.ReadRows(row1[:0])
   345  									}
   346  									if err1 != io.EOF {
   347  										t.Fatalf("errors mismatched while comparing row %d/%d: want=%v got=%v", numRows, totalRows, err1, err2)
   348  									}
   349  								}
   350  
   351  								if n != 0 {
   352  									if !row1[0].Equal(row2[0]) {
   353  										t.Errorf("row at index %d/%d mismatch: want=%+v got=%+v", numRows, totalRows, row1[0], row2[0])
   354  									}
   355  									numRows++
   356  								}
   357  
   358  								if err1 != nil {
   359  									break
   360  								}
   361  							}
   362  
   363  							if numRows != totalRows {
   364  								t.Errorf("expected to read %d rows but %d were found", totalRows, numRows)
   365  							}
   366  						})
   367  					}
   368  
   369  				})
   370  			}
   371  		})
   372  	}
   373  }
   374  
   375  func TestMergeRowGroupsCursorsAreClosed(t *testing.T) {
   376  	type model struct {
   377  		A int
   378  	}
   379  
   380  	schema := parquet.SchemaOf(model{})
   381  	options := []parquet.RowGroupOption{
   382  		parquet.SortingRowGroupConfig(
   383  			parquet.SortingColumns(
   384  				parquet.Ascending(schema.Columns()[0]...),
   385  			),
   386  		),
   387  	}
   388  
   389  	prng := rand.New(rand.NewSource(0))
   390  	rowGroups := make([]parquet.RowGroup, numRowGroups)
   391  	rows := make([]*wrappedRows, 0, numRowGroups)
   392  
   393  	for i := range rowGroups {
   394  		rowGroups[i] = wrappedRowGroup{
   395  			RowGroup: sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, model{})...),
   396  			rowsCallback: func(r parquet.Rows) parquet.Rows {
   397  				wrapped := &wrappedRows{Rows: r}
   398  				rows = append(rows, wrapped)
   399  				return wrapped
   400  			},
   401  		}
   402  	}
   403  
   404  	m, err := parquet.MergeRowGroups(rowGroups, options...)
   405  	if err != nil {
   406  		t.Fatal(err)
   407  	}
   408  	func() {
   409  		mergedRows := m.Rows()
   410  		defer mergedRows.Close()
   411  
   412  		// Add 1 more slot to the buffer to force an io.EOF on the first read.
   413  		rbuf := make([]parquet.Row, (numRowGroups*rowsPerGroup)+1)
   414  		if _, err := mergedRows.ReadRows(rbuf); !errors.Is(err, io.EOF) {
   415  			t.Fatal(err)
   416  		}
   417  	}()
   418  
   419  	for i, wrapped := range rows {
   420  		if !wrapped.closed {
   421  			t.Fatalf("RowGroup %d not closed", i)
   422  		}
   423  	}
   424  }
   425  
   426  func TestMergeRowGroupsSeekToRow(t *testing.T) {
   427  	type model struct {
   428  		A int
   429  	}
   430  
   431  	schema := parquet.SchemaOf(model{})
   432  	options := []parquet.RowGroupOption{
   433  		parquet.SortingRowGroupConfig(
   434  			parquet.SortingColumns(
   435  				parquet.Ascending(schema.Columns()[0]...),
   436  			),
   437  		),
   438  	}
   439  
   440  	rowGroups := make([]parquet.RowGroup, numRowGroups)
   441  
   442  	counter := 0
   443  	for i := range rowGroups {
   444  		rows := make([]interface{}, 0, rowsPerGroup)
   445  		for j := 0; j < rowsPerGroup; j++ {
   446  			rows = append(rows, model{A: counter})
   447  			counter++
   448  		}
   449  		rowGroups[i] = sortedRowGroup(options, rows...)
   450  	}
   451  
   452  	m, err := parquet.MergeRowGroups(rowGroups, options...)
   453  	if err != nil {
   454  		t.Fatal(err)
   455  	}
   456  
   457  	func() {
   458  		mergedRows := m.Rows()
   459  		defer mergedRows.Close()
   460  
   461  		rbuf := make([]parquet.Row, 1)
   462  		cursor := int64(0)
   463  		for {
   464  			if err := mergedRows.SeekToRow(cursor); err != nil {
   465  				t.Fatal(err)
   466  			}
   467  
   468  			if _, err := mergedRows.ReadRows(rbuf); err != nil {
   469  				if errors.Is(err, io.EOF) {
   470  					break
   471  				}
   472  				t.Fatal(err)
   473  			}
   474  			v := model{}
   475  			if err := schema.Reconstruct(&v, rbuf[0]); err != nil {
   476  				t.Fatal(err)
   477  			}
   478  			if v.A != int(cursor) {
   479  				t.Fatalf("expected value %d, got %d", cursor, v.A)
   480  			}
   481  
   482  			cursor++
   483  		}
   484  	}()
   485  }
   486  
   487  func BenchmarkMergeRowGroups(b *testing.B) {
   488  	for _, test := range readerTests {
   489  		b.Run(test.scenario, func(b *testing.B) {
   490  			schema := parquet.SchemaOf(test.model)
   491  
   492  			options := []parquet.RowGroupOption{
   493  				parquet.SortingRowGroupConfig(
   494  					parquet.SortingColumns(
   495  						parquet.Ascending(schema.Columns()[0]...),
   496  					),
   497  				),
   498  			}
   499  
   500  			prng := rand.New(rand.NewSource(0))
   501  			rowGroups := make([]parquet.RowGroup, numRowGroups)
   502  
   503  			for i := range rowGroups {
   504  				rowGroups[i] = sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, test.model)...)
   505  			}
   506  
   507  			for n := 1; n <= numRowGroups; n++ {
   508  				b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) {
   509  					mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...)
   510  					if err != nil {
   511  						b.Fatal(err)
   512  					}
   513  
   514  					rows := mergedRowGroup.Rows()
   515  					rbuf := make([]parquet.Row, benchmarkRowsPerStep)
   516  					defer func() { rows.Close() }()
   517  
   518  					benchmarkRowsPerSecond(b, func() int {
   519  						n, err := rows.ReadRows(rbuf)
   520  						if err != nil {
   521  							if !errors.Is(err, io.EOF) {
   522  								b.Fatal(err)
   523  							}
   524  							rows.Close()
   525  							rows = mergedRowGroup.Rows()
   526  						}
   527  						return n
   528  					})
   529  				})
   530  			}
   531  		})
   532  	}
   533  }
   534  
   535  func BenchmarkMergeFiles(b *testing.B) {
   536  	rowGroupBuffers := make([]bytes.Buffer, numRowGroups)
   537  
   538  	for _, test := range readerTests {
   539  		b.Run(test.scenario, func(b *testing.B) {
   540  			schema := parquet.SchemaOf(test.model)
   541  
   542  			sortingOptions := []parquet.SortingOption{
   543  				parquet.SortingColumns(
   544  					parquet.Ascending(schema.Columns()[0]...),
   545  				),
   546  			}
   547  
   548  			options := []parquet.RowGroupOption{
   549  				schema,
   550  				parquet.SortingRowGroupConfig(
   551  					sortingOptions...,
   552  				),
   553  			}
   554  
   555  			buffer := parquet.NewBuffer(options...)
   556  
   557  			prng := rand.New(rand.NewSource(0))
   558  			files := make([]*parquet.File, numRowGroups)
   559  			rowGroups := make([]parquet.RowGroup, numRowGroups)
   560  
   561  			for i := range files {
   562  				for _, row := range randomRowsOf(prng, rowsPerGroup, test.model) {
   563  					buffer.Write(row)
   564  				}
   565  				sort.Sort(buffer)
   566  				rowGroupBuffers[i].Reset()
   567  				writer := parquet.NewWriter(&rowGroupBuffers[i],
   568  					schema,
   569  					parquet.SortingWriterConfig(
   570  						sortingOptions...,
   571  					),
   572  				)
   573  				_, err := copyRowsAndClose(writer, buffer.Rows())
   574  				if err != nil {
   575  					b.Fatal(err)
   576  				}
   577  				if err := writer.Close(); err != nil {
   578  					b.Fatal(err)
   579  				}
   580  				r := bytes.NewReader(rowGroupBuffers[i].Bytes())
   581  				f, err := parquet.OpenFile(r, r.Size())
   582  				if err != nil {
   583  					b.Fatal(err)
   584  				}
   585  				files[i], rowGroups[i] = f, f.RowGroups()[0]
   586  			}
   587  
   588  			for n := 1; n <= numRowGroups; n++ {
   589  				b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) {
   590  					mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...)
   591  					if err != nil {
   592  						b.Fatal(err)
   593  					}
   594  
   595  					rows := mergedRowGroup.Rows()
   596  					rbuf := make([]parquet.Row, benchmarkRowsPerStep)
   597  					defer func() { rows.Close() }()
   598  
   599  					benchmarkRowsPerSecond(b, func() int {
   600  						n, err := rows.ReadRows(rbuf)
   601  						if err != nil {
   602  							if !errors.Is(err, io.EOF) {
   603  								b.Fatal(err)
   604  							}
   605  							rows.Close()
   606  							rows = mergedRowGroup.Rows()
   607  						}
   608  						return n
   609  					})
   610  
   611  					totalSize := int64(0)
   612  					for _, f := range files[:n] {
   613  						totalSize += f.Size()
   614  					}
   615  				})
   616  			}
   617  		})
   618  	}
   619  }