github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/row_group_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"reflect"
     6  	"sort"
     7  	"testing"
     8  
     9  	"github.com/vc42/parquet-go"
    10  )
    11  
    12  func sortedRowGroup(options []parquet.RowGroupOption, rows ...interface{}) parquet.RowGroup {
    13  	buf := parquet.NewBuffer(options...)
    14  	for _, row := range rows {
    15  		buf.Write(row)
    16  	}
    17  	sort.Stable(buf)
    18  	return buf
    19  }
    20  
    21  type Person struct {
    22  	FirstName utf8string
    23  	LastName  utf8string
    24  	Age       int
    25  }
    26  
    27  type LastNameOnly struct {
    28  	LastName utf8string
    29  }
    30  
    31  func newPeopleBuffer(people []Person) parquet.RowGroup {
    32  	buffer := parquet.NewBuffer()
    33  	for i := range people {
    34  		buffer.Write(&people[i])
    35  	}
    36  	return buffer
    37  }
    38  
    39  func newPeopleFile(people []Person) parquet.RowGroup {
    40  	buffer := new(bytes.Buffer)
    41  	writer := parquet.NewWriter(buffer)
    42  	for i := range people {
    43  		writer.Write(&people[i])
    44  	}
    45  	writer.Close()
    46  	reader := bytes.NewReader(buffer.Bytes())
    47  	f, err := parquet.OpenFile(reader, reader.Size())
    48  	if err != nil {
    49  		panic(err)
    50  	}
    51  	return f.RowGroups()[0]
    52  }
    53  
    54  func TestSeekToRow(t *testing.T) {
    55  	for _, config := range []struct {
    56  		name        string
    57  		newRowGroup func([]Person) parquet.RowGroup
    58  	}{
    59  		{name: "buffer", newRowGroup: newPeopleBuffer},
    60  		{name: "file", newRowGroup: newPeopleFile},
    61  	} {
    62  		t.Run(config.name, func(t *testing.T) { testSeekToRow(t, config.newRowGroup) })
    63  	}
    64  }
    65  
    66  func testSeekToRow(t *testing.T, newRowGroup func([]Person) parquet.RowGroup) {
    67  	err := quickCheck(func(people []Person) bool {
    68  		if len(people) == 0 { // TODO: fix creation of empty parquet files
    69  			return true
    70  		}
    71  		rowGroup := newRowGroup(people)
    72  		rows := rowGroup.Rows()
    73  		rbuf := make([]parquet.Row, 1)
    74  		pers := Person{}
    75  		schema := parquet.SchemaOf(&pers)
    76  		defer rows.Close()
    77  
    78  		for i := range people {
    79  			if err := rows.SeekToRow(int64(i)); err != nil {
    80  				t.Errorf("seeking to row %d: %+v", i, err)
    81  				return false
    82  			}
    83  			if _, err := rows.ReadRows(rbuf); err != nil {
    84  				t.Errorf("reading row %d: %+v", i, err)
    85  				return false
    86  			}
    87  			if err := schema.Reconstruct(&pers, rbuf[0]); err != nil {
    88  				t.Errorf("deconstructing row %d: %+v", i, err)
    89  				return false
    90  			}
    91  			if !reflect.DeepEqual(&pers, &people[i]) {
    92  				t.Errorf("row %d mismatch", i)
    93  				return false
    94  			}
    95  		}
    96  
    97  		return true
    98  	})
    99  	if err != nil {
   100  		t.Error(err)
   101  	}
   102  }
   103  
   104  func selfRowGroup(rowGroup parquet.RowGroup) parquet.RowGroup {
   105  	return rowGroup
   106  }
   107  
   108  func fileRowGroup(rowGroup parquet.RowGroup) parquet.RowGroup {
   109  	buffer := new(bytes.Buffer)
   110  	writer := parquet.NewWriter(buffer)
   111  	if _, err := writer.WriteRowGroup(rowGroup); err != nil {
   112  		panic(err)
   113  	}
   114  	if err := writer.Close(); err != nil {
   115  		panic(err)
   116  	}
   117  	reader := bytes.NewReader(buffer.Bytes())
   118  	f, err := parquet.OpenFile(reader, reader.Size())
   119  	if err != nil {
   120  		panic(err)
   121  	}
   122  	return f.RowGroups()[0]
   123  }
   124  
   125  func TestMergeRowGroups(t *testing.T) {
   126  	tests := []struct {
   127  		scenario string
   128  		options  []parquet.RowGroupOption
   129  		input    []parquet.RowGroup
   130  		output   parquet.RowGroup
   131  	}{
   132  		{
   133  			scenario: "no row groups",
   134  			options: []parquet.RowGroupOption{
   135  				parquet.SchemaOf(Person{}),
   136  			},
   137  			output: sortedRowGroup(
   138  				[]parquet.RowGroupOption{
   139  					parquet.SchemaOf(Person{}),
   140  				},
   141  			),
   142  		},
   143  
   144  		{
   145  			scenario: "a single row group",
   146  			input: []parquet.RowGroup{
   147  				sortedRowGroup(nil,
   148  					Person{FirstName: "some", LastName: "one", Age: 30},
   149  					Person{FirstName: "some", LastName: "one else", Age: 31},
   150  					Person{FirstName: "and", LastName: "you", Age: 32},
   151  				),
   152  			},
   153  			output: sortedRowGroup(nil,
   154  				Person{FirstName: "some", LastName: "one", Age: 30},
   155  				Person{FirstName: "some", LastName: "one else", Age: 31},
   156  				Person{FirstName: "and", LastName: "you", Age: 32},
   157  			),
   158  		},
   159  
   160  		{
   161  			scenario: "two row groups without ordering",
   162  			input: []parquet.RowGroup{
   163  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}),
   164  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}),
   165  			},
   166  			output: sortedRowGroup(nil,
   167  				Person{FirstName: "some", LastName: "one", Age: 30},
   168  				Person{FirstName: "some", LastName: "one else", Age: 31},
   169  			),
   170  		},
   171  
   172  		{
   173  			scenario: "three row groups without ordering",
   174  			input: []parquet.RowGroup{
   175  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}),
   176  				sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}),
   177  				sortedRowGroup(nil, Person{FirstName: "question", LastName: "answer", Age: 42}),
   178  			},
   179  			output: sortedRowGroup(nil,
   180  				Person{FirstName: "some", LastName: "one", Age: 30},
   181  				Person{FirstName: "some", LastName: "one else", Age: 31},
   182  				Person{FirstName: "question", LastName: "answer", Age: 42},
   183  			),
   184  		},
   185  
   186  		{
   187  			scenario: "row groups sorted by ascending last name",
   188  			options: []parquet.RowGroupOption{
   189  				parquet.SortingColumns(
   190  					parquet.Ascending("LastName"),
   191  				),
   192  			},
   193  			input: []parquet.RowGroup{
   194  				sortedRowGroup(
   195  					[]parquet.RowGroupOption{
   196  						parquet.SortingColumns(
   197  							parquet.Ascending("LastName"),
   198  						),
   199  					},
   200  					Person{FirstName: "Han", LastName: "Solo"},
   201  					Person{FirstName: "Luke", LastName: "Skywalker"},
   202  				),
   203  				sortedRowGroup(
   204  					[]parquet.RowGroupOption{
   205  						parquet.SortingColumns(
   206  							parquet.Ascending("LastName"),
   207  						),
   208  					},
   209  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   210  				),
   211  			},
   212  			output: sortedRowGroup(nil,
   213  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   214  				Person{FirstName: "Luke", LastName: "Skywalker"},
   215  				Person{FirstName: "Han", LastName: "Solo"},
   216  			),
   217  		},
   218  
   219  		{
   220  			scenario: "row groups sorted by descending last name",
   221  			options: []parquet.RowGroupOption{
   222  				parquet.SortingColumns(
   223  					parquet.Descending("LastName"),
   224  				),
   225  			},
   226  			input: []parquet.RowGroup{
   227  				sortedRowGroup(
   228  					[]parquet.RowGroupOption{
   229  						parquet.SortingColumns(
   230  							parquet.Descending("LastName"),
   231  						),
   232  					},
   233  					Person{FirstName: "Han", LastName: "Solo"},
   234  					Person{FirstName: "Luke", LastName: "Skywalker"},
   235  				),
   236  				sortedRowGroup(
   237  					[]parquet.RowGroupOption{
   238  						parquet.SortingColumns(
   239  							parquet.Descending("LastName"),
   240  						),
   241  					},
   242  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   243  				),
   244  			},
   245  			output: sortedRowGroup(nil,
   246  				Person{FirstName: "Han", LastName: "Solo"},
   247  				Person{FirstName: "Luke", LastName: "Skywalker"},
   248  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   249  			),
   250  		},
   251  
   252  		{
   253  			scenario: "row groups sorted by ascending last and first name",
   254  			options: []parquet.RowGroupOption{
   255  				parquet.SortingColumns(
   256  					parquet.Ascending("LastName"),
   257  					parquet.Ascending("FirstName"),
   258  				),
   259  			},
   260  			input: []parquet.RowGroup{
   261  				sortedRowGroup(
   262  					[]parquet.RowGroupOption{
   263  						parquet.SortingColumns(
   264  							parquet.Ascending("LastName"),
   265  							parquet.Ascending("FirstName"),
   266  						),
   267  					},
   268  					Person{FirstName: "Luke", LastName: "Skywalker"},
   269  					Person{FirstName: "Han", LastName: "Solo"},
   270  				),
   271  				sortedRowGroup(
   272  					[]parquet.RowGroupOption{
   273  						parquet.SortingColumns(
   274  							parquet.Ascending("LastName"),
   275  							parquet.Ascending("FirstName"),
   276  						),
   277  					},
   278  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   279  					Person{FirstName: "Anakin", LastName: "Skywalker"},
   280  				),
   281  			},
   282  			output: sortedRowGroup(nil,
   283  				Person{FirstName: "Obiwan", LastName: "Kenobi"},
   284  				Person{FirstName: "Anakin", LastName: "Skywalker"},
   285  				Person{FirstName: "Luke", LastName: "Skywalker"},
   286  				Person{FirstName: "Han", LastName: "Solo"},
   287  			),
   288  		},
   289  
   290  		{
   291  			scenario: "row groups with conversion to a different schema",
   292  			options: []parquet.RowGroupOption{
   293  				parquet.SchemaOf(LastNameOnly{}),
   294  				parquet.SortingColumns(
   295  					parquet.Ascending("LastName"),
   296  				),
   297  			},
   298  			input: []parquet.RowGroup{
   299  				sortedRowGroup(
   300  					[]parquet.RowGroupOption{
   301  						parquet.SortingColumns(
   302  							parquet.Ascending("LastName"),
   303  						),
   304  					},
   305  					Person{FirstName: "Han", LastName: "Solo"},
   306  					Person{FirstName: "Luke", LastName: "Skywalker"},
   307  				),
   308  				sortedRowGroup(
   309  					[]parquet.RowGroupOption{
   310  						parquet.SortingColumns(
   311  							parquet.Ascending("LastName"),
   312  						),
   313  					},
   314  					Person{FirstName: "Obiwan", LastName: "Kenobi"},
   315  					Person{FirstName: "Anakin", LastName: "Skywalker"},
   316  				),
   317  			},
   318  			output: sortedRowGroup(
   319  				[]parquet.RowGroupOption{
   320  					parquet.SortingColumns(
   321  						parquet.Ascending("LastName"),
   322  					),
   323  				},
   324  				LastNameOnly{LastName: "Solo"},
   325  				LastNameOnly{LastName: "Skywalker"},
   326  				LastNameOnly{LastName: "Skywalker"},
   327  				LastNameOnly{LastName: "Kenobi"},
   328  			),
   329  		},
   330  	}
   331  
   332  	for _, adapter := range []struct {
   333  		scenario string
   334  		function func(parquet.RowGroup) parquet.RowGroup
   335  	}{
   336  		{scenario: "buffer", function: selfRowGroup},
   337  		{scenario: "file", function: fileRowGroup},
   338  	} {
   339  		t.Run(adapter.scenario, func(t *testing.T) {
   340  			for _, test := range tests {
   341  				t.Run(test.scenario, func(t *testing.T) {
   342  					input := make([]parquet.RowGroup, len(test.input))
   343  					for i := range test.input {
   344  						input[i] = adapter.function(test.input[i])
   345  					}
   346  
   347  					merged, err := parquet.MergeRowGroups(test.input, test.options...)
   348  					if err != nil {
   349  						t.Fatal(err)
   350  					}
   351  					if merged.NumRows() != test.output.NumRows() {
   352  						t.Fatalf("the number of rows mismatch: want=%d got=%d", merged.NumRows(), test.output.NumRows())
   353  					}
   354  					if merged.Schema() != test.output.Schema() {
   355  						t.Fatalf("the row group schemas mismatch:\n%v\n%v", test.output.Schema(), merged.Schema())
   356  					}
   357  
   358  					options := []parquet.RowGroupOption{parquet.SchemaOf(Person{})}
   359  					options = append(options, test.options...)
   360  					// We test two views of the resulting row group: the one originally
   361  					// returned by MergeRowGroups, and one where the merged row group
   362  					// has been copied into a new buffer. The intent is to exercise both
   363  					// the row-by-row read as well as optimized code paths when CopyRows
   364  					// bypasses the ReadRow/WriteRow calls and the row group is written
   365  					// directly to the buffer by calling WriteRowsTo/WriteRowGroup.
   366  					mergedCopy := parquet.NewBuffer(options...)
   367  
   368  					totalRows := test.output.NumRows()
   369  					numRows, err := copyRowsAndClose(mergedCopy, merged.Rows())
   370  					if err != nil {
   371  						t.Fatal(err)
   372  					}
   373  					if numRows != totalRows {
   374  						t.Fatalf("wrong number of rows copied: want=%d got=%d", totalRows, numRows)
   375  					}
   376  
   377  					for _, merge := range []struct {
   378  						scenario string
   379  						rowGroup parquet.RowGroup
   380  					}{
   381  						{scenario: "self", rowGroup: merged},
   382  						{scenario: "copy", rowGroup: mergedCopy},
   383  					} {
   384  						t.Run(merge.scenario, func(t *testing.T) {
   385  							var expectedRows = test.output.Rows()
   386  							var mergedRows = merge.rowGroup.Rows()
   387  							var row1 = make([]parquet.Row, 1)
   388  							var row2 = make([]parquet.Row, 1)
   389  							var numRows int64
   390  
   391  							defer expectedRows.Close()
   392  							defer mergedRows.Close()
   393  
   394  							for {
   395  								_, err1 := expectedRows.ReadRows(row1)
   396  								_, err2 := mergedRows.ReadRows(row2)
   397  
   398  								if err1 != err2 {
   399  									t.Fatalf("errors mismatched while comparing row %d/%d: want=%v got=%v", numRows, totalRows, err1, err2)
   400  								}
   401  
   402  								if err1 != nil {
   403  									break
   404  								}
   405  
   406  								if !row1[0].Equal(row2[0]) {
   407  									t.Errorf("row at index %d/%d mismatch: want=%+v got=%+v", numRows, totalRows, row1[0], row2[0])
   408  								}
   409  
   410  								numRows++
   411  							}
   412  
   413  							if numRows != totalRows {
   414  								t.Errorf("expected to read %d rows but %d were found", totalRows, numRows)
   415  							}
   416  						})
   417  					}
   418  
   419  				})
   420  			}
   421  		})
   422  	}
   423  }