github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/writer_go18_test.go (about)

     1  //go:build go1.18
     2  
     3  package parquet_test
     4  
     5  import (
     6  	"bytes"
     7  	"io"
     8  	"math/rand"
     9  	"reflect"
    10  	"testing"
    11  
    12  	"github.com/segmentio/parquet-go"
    13  )
    14  
// BenchmarkGenericWriter runs benchmarkGenericWriter once for each of the row
// types listed below. Each call registers its own sub-benchmark, so the order
// of the calls determines the order in which the results are reported.
func BenchmarkGenericWriter(b *testing.B) {
	benchmarkGenericWriter[benchmarkRowType](b)
	benchmarkGenericWriter[booleanColumn](b)
	benchmarkGenericWriter[int32Column](b)
	benchmarkGenericWriter[int64Column](b)
	benchmarkGenericWriter[floatColumn](b)
	benchmarkGenericWriter[doubleColumn](b)
	benchmarkGenericWriter[byteArrayColumn](b)
	benchmarkGenericWriter[fixedLenByteArrayColumn](b)
	benchmarkGenericWriter[stringColumn](b)
	benchmarkGenericWriter[indexedStringColumn](b)
	benchmarkGenericWriter[uuidColumn](b)
	benchmarkGenericWriter[timeColumn](b)
	benchmarkGenericWriter[timeInMillisColumn](b)
	benchmarkGenericWriter[mapColumn](b)
	benchmarkGenericWriter[decimalColumn](b)
	benchmarkGenericWriter[contact](b)
	benchmarkGenericWriter[paddedBooleanColumn](b)
	benchmarkGenericWriter[optionalInt32Column](b)
	benchmarkGenericWriter[repeatedInt32Column](b)
}
    36  
    37  func benchmarkGenericWriter[Row generator[Row]](b *testing.B) {
    38  	var model Row
    39  	b.Run(reflect.TypeOf(model).Name(), func(b *testing.B) {
    40  		prng := rand.New(rand.NewSource(0))
    41  		rows := make([]Row, benchmarkNumRows)
    42  		for i := range rows {
    43  			rows[i] = rows[i].generate(prng)
    44  		}
    45  
    46  		b.Run("go1.17", func(b *testing.B) {
    47  			writer := parquet.NewWriter(io.Discard, parquet.SchemaOf(rows[0]))
    48  			i := 0
    49  			benchmarkRowsPerSecond(b, func() int {
    50  				for j := 0; j < benchmarkRowsPerStep; j++ {
    51  					if err := writer.Write(&rows[i]); err != nil {
    52  						b.Fatal(err)
    53  					}
    54  				}
    55  
    56  				i += benchmarkRowsPerStep
    57  				i %= benchmarkNumRows
    58  
    59  				if i == 0 {
    60  					writer.Close()
    61  					writer.Reset(io.Discard)
    62  				}
    63  				return benchmarkRowsPerStep
    64  			})
    65  		})
    66  
    67  		b.Run("go1.18", func(b *testing.B) {
    68  			writer := parquet.NewGenericWriter[Row](io.Discard)
    69  			i := 0
    70  			benchmarkRowsPerSecond(b, func() int {
    71  				n, err := writer.Write(rows[i : i+benchmarkRowsPerStep])
    72  				if err != nil {
    73  					b.Fatal(err)
    74  				}
    75  
    76  				i += benchmarkRowsPerStep
    77  				i %= benchmarkNumRows
    78  
    79  				if i == 0 {
    80  					writer.Close()
    81  					writer.Reset(io.Discard)
    82  				}
    83  				return n
    84  			})
    85  		})
    86  	})
    87  }
    88  
    89  func TestIssue272(t *testing.T) {
    90  	type T2 struct {
    91  		X string `parquet:",dict,optional"`
    92  	}
    93  
    94  	type T1 struct {
    95  		TA *T2
    96  		TB *T2
    97  	}
    98  
    99  	type T struct {
   100  		T1 *T1
   101  	}
   102  
   103  	const nRows = 1
   104  
   105  	row := T{
   106  		T1: &T1{
   107  			TA: &T2{
   108  				X: "abc",
   109  			},
   110  		},
   111  	}
   112  
   113  	rows := make([]T, nRows)
   114  	for i := range rows {
   115  		rows[i] = row
   116  	}
   117  
   118  	b := new(bytes.Buffer)
   119  	w := parquet.NewGenericWriter[T](b)
   120  
   121  	if _, err := w.Write(rows); err != nil {
   122  		t.Fatal(err)
   123  	}
   124  	if err := w.Close(); err != nil {
   125  		t.Fatal(err)
   126  	}
   127  
   128  	f := bytes.NewReader(b.Bytes())
   129  	r := parquet.NewGenericReader[T](f)
   130  
   131  	parquetRows := make([]parquet.Row, nRows)
   132  	n, err := r.ReadRows(parquetRows)
   133  	if err != nil && err != io.EOF {
   134  		t.Fatal(err)
   135  	}
   136  	if n != nRows {
   137  		t.Fatalf("wrong number of rows read: want=%d got=%d", nRows, n)
   138  	}
   139  	for _, r := range parquetRows {
   140  		if d := r[0].DefinitionLevel(); d != 3 {
   141  			t.Errorf("wrong definition level for column 0: %d", d)
   142  		}
   143  		if d := r[1].DefinitionLevel(); d != 1 {
   144  			t.Errorf("wrong definition level for column 1: %d", d)
   145  		}
   146  	}
   147  }
   148  
   149  func TestIssue279(t *testing.T) {
   150  	type T2 struct {
   151  		Id   int    `parquet:",plain,optional"`
   152  		Name string `parquet:",plain,optional"`
   153  	}
   154  
   155  	type T1 struct {
   156  		TA []*T2
   157  	}
   158  
   159  	type T struct {
   160  		T1 *T1
   161  	}
   162  
   163  	const nRows = 1
   164  
   165  	row := T{
   166  		T1: &T1{
   167  			TA: []*T2{
   168  				{
   169  					Id:   43,
   170  					Name: "john",
   171  				},
   172  			},
   173  		},
   174  	}
   175  
   176  	rows := make([]T, nRows)
   177  	for i := range rows {
   178  		rows[i] = row
   179  	}
   180  
   181  	b := new(bytes.Buffer)
   182  	w := parquet.NewGenericWriter[T](b)
   183  
   184  	if _, err := w.Write(rows); err != nil {
   185  		t.Fatal(err)
   186  	}
   187  	if err := w.Close(); err != nil {
   188  		t.Fatal(err)
   189  	}
   190  
   191  	f := bytes.NewReader(b.Bytes())
   192  	r := parquet.NewGenericReader[T](f)
   193  
   194  	parquetRows := make([]parquet.Row, nRows)
   195  	n, err := r.ReadRows(parquetRows)
   196  	if err != nil && err != io.EOF {
   197  		t.Fatal(err)
   198  	}
   199  	if n != nRows {
   200  		t.Fatalf("wrong number of rows read: want=%d got=%d", nRows, n)
   201  	}
   202  	for _, r := range parquetRows {
   203  		if d := r[0].DefinitionLevel(); d != 3 {
   204  			t.Errorf("wrong definition level for column 0: %d", d)
   205  		}
   206  		if d := r[1].DefinitionLevel(); d != 3 {
   207  			t.Errorf("wrong definition level for column 1: %d", d)
   208  		}
   209  	}
   210  }
   211  
   212  func TestIssue302(t *testing.T) {
   213  	tests := []struct {
   214  		name string
   215  		fn   func(t *testing.T)
   216  	}{
   217  		{
   218  			name: "SimpleMap",
   219  			fn: func(t *testing.T) {
   220  				type M map[string]int
   221  
   222  				type T struct {
   223  					M M `parquet:","`
   224  				}
   225  
   226  				b := new(bytes.Buffer)
   227  				_ = parquet.NewGenericWriter[T](b)
   228  
   229  			},
   230  		},
   231  
   232  		{
   233  			name: "MapWithValueTag",
   234  			fn: func(t *testing.T) {
   235  				type M map[string]int
   236  
   237  				type T struct {
   238  					M M `parquet:"," parquet-value:",zstd"`
   239  				}
   240  
   241  				b := new(bytes.Buffer)
   242  				_ = parquet.NewGenericWriter[T](b)
   243  
   244  			},
   245  		},
   246  
   247  		{
   248  			name: "MapWithOptionalTag",
   249  			fn: func(t *testing.T) {
   250  				type M map[string]int
   251  
   252  				type T struct {
   253  					M M `parquet:",optional"`
   254  				}
   255  
   256  				b := new(bytes.Buffer)
   257  				w := parquet.NewGenericWriter[T](b)
   258  				expect := []T{
   259  					{
   260  						M: M{
   261  							"Holden": 1,
   262  							"Naomi":  2,
   263  						},
   264  					},
   265  					{
   266  						M: nil,
   267  					},
   268  					{
   269  						M: M{
   270  							"Naomi":  1,
   271  							"Holden": 2,
   272  						},
   273  					},
   274  				}
   275  				_, err := w.Write(expect)
   276  				if err != nil {
   277  					t.Fatal(err)
   278  				}
   279  				if err = w.Close(); err != nil {
   280  					t.Fatal(err)
   281  				}
   282  
   283  				bufReader := bytes.NewReader(b.Bytes())
   284  				r := parquet.NewGenericReader[T](bufReader)
   285  				values := make([]T, 3)
   286  				_, err = r.Read(values)
   287  				if !reflect.DeepEqual(expect, values) {
   288  					t.Fatalf("values do not match.\n\texpect: %v\n\tactual: %v", expect, values)
   289  				}
   290  			},
   291  		},
   292  	}
   293  
   294  	for _, test := range tests {
   295  		t.Run(test.name, test.fn)
   296  	}
   297  }
   298  
   299  func TestIssue347Writer(t *testing.T) {
   300  	type TestType struct {
   301  		Key int
   302  	}
   303  
   304  	b := new(bytes.Buffer)
   305  	// instantiating with concrete type shouldn't panic
   306  	_ = parquet.NewGenericWriter[TestType](b)
   307  
   308  	// instantiating with schema and interface type parameter shouldn't panic
   309  	schema := parquet.SchemaOf(TestType{})
   310  	_ = parquet.NewGenericWriter[any](b, schema)
   311  
   312  	defer func() {
   313  		if r := recover(); r == nil {
   314  			t.Errorf("instantiating generic buffer without schema and with interface " +
   315  				"type parameter should panic")
   316  		}
   317  	}()
   318  	_ = parquet.NewGenericWriter[any](b)
   319  }
   320  
   321  func TestIssue375(t *testing.T) {
   322  	type Row struct{ FirstName, LastName string }
   323  
   324  	output := new(bytes.Buffer)
   325  	writer := parquet.NewGenericWriter[Row](output, parquet.MaxRowsPerRowGroup(10))
   326  
   327  	rows := make([]Row, 100)
   328  	for i := range rows {
   329  		rows[i] = Row{
   330  			FirstName: "0123456789"[i%10 : i%10+1],
   331  			LastName:  "foo",
   332  		}
   333  	}
   334  
   335  	n, err := writer.Write(rows)
   336  	if err != nil {
   337  		t.Fatal(err)
   338  	}
   339  	if n != len(rows) {
   340  		t.Fatal("wrong number of rows written:", n)
   341  	}
   342  
   343  	if err := writer.Close(); err != nil {
   344  		t.Fatal(err)
   345  	}
   346  
   347  	f, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len()))
   348  	if err != nil {
   349  		t.Fatal(err)
   350  	}
   351  
   352  	rowGroups := f.RowGroups()
   353  	if len(rowGroups) != 10 {
   354  		t.Errorf("wrong number of row groups in parquet file: want=10 got=%d", len(rowGroups))
   355  	}
   356  }
   357  
   358  func TestGenericSetKeyValueMetadata(t *testing.T) {
   359  	testKey := "test-key"
   360  	testValue := "test-value"
   361  
   362  	type Row struct{ FirstName, LastName string }
   363  
   364  	output := new(bytes.Buffer)
   365  	writer := parquet.NewGenericWriter[Row](output, parquet.MaxRowsPerRowGroup(10))
   366  
   367  	rows := []Row{
   368  		{FirstName: "First", LastName: "Last"},
   369  	}
   370  
   371  	_, err := writer.Write(rows)
   372  	if err != nil {
   373  		t.Fatal(err)
   374  	}
   375  
   376  	writer.SetKeyValueMetadata(testKey, testValue)
   377  
   378  	err = writer.Close()
   379  	if err != nil {
   380  		t.Fatal(err)
   381  	}
   382  
   383  	f, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len()))
   384  	if err != nil {
   385  		t.Fatal(err)
   386  	}
   387  
   388  	value, ok := f.Lookup(testKey)
   389  	if !ok {
   390  		t.Fatalf("key/value metadata should have included %q", testKey)
   391  	}
   392  	if value != testValue {
   393  		t.Errorf("expected %q, got %q", testValue, value)
   394  	}
   395  }