github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/reader_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io"
     7  	"math"
     8  	"math/rand"
     9  	"reflect"
    10  	"testing"
    11  
    12  	"github.com/segmentio/parquet-go"
    13  	"github.com/segmentio/parquet-go/internal/quick"
    14  )
    15  
    16  func rowsOf(numRows int, model interface{}) rows {
    17  	prng := rand.New(rand.NewSource(0))
    18  	return randomRowsOf(prng, numRows, model)
    19  }
    20  
    21  func randomRowsOf(prng *rand.Rand, numRows int, model interface{}) rows {
    22  	typ := reflect.TypeOf(model)
    23  	rows := make(rows, numRows)
    24  	makeValue := quick.MakeValueFuncOf(typ)
    25  	for i := range rows {
    26  		v := reflect.New(typ).Elem()
    27  		makeValue(v, prng)
    28  		rows[i] = v.Interface()
    29  	}
    30  	return rows
    31  }
    32  
// readerTests enumerates the row models exercised by TestReader and the
// reader benchmarks: one scenario per parquet physical type, plus logical
// types (STRING, UUID, time, DECIMAL) and nested/repeated/map schemas.
var readerTests = []struct {
	scenario string
	model    interface{}
}{
	{
		scenario: "BOOLEAN",
		model:    booleanColumn{},
	},

	{
		scenario: "INT32",
		model:    int32Column{},
	},

	{
		scenario: "INT64",
		model:    int64Column{},
	},

	{
		scenario: "INT96",
		model:    int96Column{},
	},

	{
		scenario: "FLOAT",
		model:    floatColumn{},
	},

	{
		scenario: "DOUBLE",
		model:    doubleColumn{},
	},

	{
		scenario: "BYTE_ARRAY",
		model:    byteArrayColumn{},
	},

	{
		scenario: "FIXED_LEN_BYTE_ARRAY",
		model:    fixedLenByteArrayColumn{},
	},

	{
		scenario: "STRING",
		model:    stringColumn{},
	},

	{
		// Same column type as STRING but with dictionary encoding enabled.
		scenario: "STRING (dict)",
		model:    indexedStringColumn{},
	},

	{
		scenario: "UUID",
		model:    uuidColumn{},
	},

	{
		scenario: "time.Time",
		model:    timeColumn{},
	},

	{
		scenario: "time.Time in ms",
		model:    timeInMillisColumn{},
	},

	{
		scenario: "DECIMAL",
		model:    decimalColumn{},
	},

	{
		// Nested message-style struct with optional and repeated fields.
		scenario: "AddressBook",
		model:    addressBook{},
	},

	{
		scenario: "one optional level",
		model:    listColumn2{},
	},

	{
		scenario: "one repeated level",
		model:    listColumn1{},
	},

	{
		scenario: "two repeated levels",
		model:    listColumn0{},
	},

	{
		// NOTE(review): this scenario reuses listColumn0, identical to the
		// "two repeated levels" case above — possibly a copy/paste leftover;
		// confirm whether a deeper (three-level) list model was intended.
		scenario: "three repeated levels",
		model:    listColumn0{},
	},

	{
		scenario: "nested lists",
		model:    nestedListColumn{},
	},

	{
		scenario: "key-value pairs",
		model: struct {
			KeyValuePairs map[utf8string]utf8string
		}{},
	},

	{
		scenario: "multiple key-value pairs",
		model: struct {
			KeyValuePairs0 map[utf8string]utf8string
			KeyValuePairs1 map[utf8string]utf8string
			KeyValuePairs2 map[utf8string]utf8string
		}{},
	},

	{
		scenario: "repeated key-value pairs",
		model: struct {
			RepeatedKeyValuePairs []map[utf8string]utf8string
		}{},
	},

	{
		scenario: "map of repeated values",
		model: struct {
			MapOfRepeated map[utf8string][]utf8string
		}{},
	},
}
   167  
   168  func TestReader(t *testing.T) {
   169  	buf := new(bytes.Buffer)
   170  	file := bytes.NewReader(nil)
   171  
   172  	for _, test := range readerTests {
   173  		t.Run(test.scenario, func(t *testing.T) {
   174  			const N = 42
   175  
   176  			rowType := reflect.TypeOf(test.model)
   177  			rowPtr := reflect.New(rowType)
   178  			rowZero := reflect.Zero(rowType)
   179  			rowValue := rowPtr.Elem()
   180  
   181  			for n := 1; n < N; n++ {
   182  				t.Run(fmt.Sprintf("N=%d", n), func(t *testing.T) {
   183  					defer buf.Reset()
   184  					rows := rowsOf(n, test.model)
   185  
   186  					if err := writeParquetFileWithBuffer(buf, rows); err != nil {
   187  						t.Fatal(err)
   188  					}
   189  
   190  					file.Reset(buf.Bytes())
   191  					r := parquet.NewReader(file, parquet.SchemaOf(test.model))
   192  
   193  					for i, v := range rows {
   194  						if err := r.Read(rowPtr.Interface()); err != nil {
   195  							t.Fatal(err)
   196  						}
   197  						if !reflect.DeepEqual(rowValue.Interface(), v) {
   198  							t.Errorf("row mismatch at index %d\nwant = %+v\ngot  = %+v", i, v, rowValue.Interface())
   199  						}
   200  						rowValue.Set(rowZero)
   201  					}
   202  
   203  					if err := r.Read(rowPtr.Interface()); err != io.EOF {
   204  						t.Errorf("expected EOF after reading all values but got: %v", err)
   205  					}
   206  				})
   207  			}
   208  		})
   209  	}
   210  }
   211  
   212  func BenchmarkReaderReadType(b *testing.B) {
   213  	buf := new(bytes.Buffer)
   214  	file := bytes.NewReader(nil)
   215  
   216  	for _, test := range readerTests {
   217  		b.Run(test.scenario, func(b *testing.B) {
   218  			defer buf.Reset()
   219  			rows := rowsOf(benchmarkNumRows, test.model)
   220  
   221  			if err := writeParquetFile(buf, rows); err != nil {
   222  				b.Fatal(err)
   223  			}
   224  			file.Reset(buf.Bytes())
   225  			f, err := parquet.OpenFile(file, file.Size())
   226  			if err != nil {
   227  				b.Fatal(err)
   228  			}
   229  
   230  			rowType := reflect.TypeOf(test.model)
   231  			rowPtr := reflect.New(rowType)
   232  			rowZero := reflect.Zero(rowType)
   233  			rowValue := rowPtr.Elem()
   234  
   235  			r := parquet.NewReader(f)
   236  			p := rowPtr.Interface()
   237  
   238  			benchmarkRowsPerSecond(b, func() (n int) {
   239  				for i := 0; i < benchmarkRowsPerStep; i++ {
   240  					if err := r.Read(p); err != nil {
   241  						if err == io.EOF {
   242  							r.Reset()
   243  						} else {
   244  							b.Fatal(err)
   245  						}
   246  					}
   247  				}
   248  				rowValue.Set(rowZero)
   249  				return benchmarkRowsPerStep
   250  			})
   251  
   252  			b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows)))
   253  		})
   254  	}
   255  }
   256  
   257  func BenchmarkReaderReadRow(b *testing.B) {
   258  	buf := new(bytes.Buffer)
   259  	file := bytes.NewReader(nil)
   260  
   261  	for _, test := range readerTests {
   262  		b.Run(test.scenario, func(b *testing.B) {
   263  			defer buf.Reset()
   264  			rows := rowsOf(benchmarkNumRows, test.model)
   265  
   266  			if err := writeParquetFile(buf, rows); err != nil {
   267  				b.Fatal(err)
   268  			}
   269  			file.Reset(buf.Bytes())
   270  			f, err := parquet.OpenFile(file, file.Size())
   271  			if err != nil {
   272  				b.Fatal(err)
   273  			}
   274  
   275  			r := parquet.NewReader(f)
   276  			rowbuf := make([]parquet.Row, benchmarkRowsPerStep)
   277  
   278  			benchmarkRowsPerSecond(b, func() int {
   279  				n, err := r.ReadRows(rowbuf)
   280  				if err != nil {
   281  					if err == io.EOF {
   282  						r.Reset()
   283  					} else {
   284  						b.Fatal(err)
   285  					}
   286  				}
   287  				return n
   288  			})
   289  
   290  			b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows)))
   291  		})
   292  	}
   293  }
   294  
   295  func TestReaderReadSubset(t *testing.T) {
   296  	// In this example we'll write 3 columns to the file - X, Y, and Z, but
   297  	// we'll only read out the X and Y columns. Returns true if all writes
   298  	// and reads were successful, and false otherwise.
   299  	type Point3D struct{ X, Y, Z int64 }
   300  	type Point2D struct{ X, Y int64 }
   301  
   302  	err := quickCheck(func(points3D []Point3D) bool {
   303  		if len(points3D) == 0 {
   304  			return true
   305  		}
   306  		buf := new(bytes.Buffer)
   307  		err := writeParquetFile(buf, makeRows(points3D))
   308  		if err != nil {
   309  			t.Error(err)
   310  			return false
   311  		}
   312  		reader := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   313  		for i := 0; ; i++ {
   314  			row := Point2D{}
   315  			err := reader.Read(&row)
   316  			if err != nil {
   317  				if err == io.EOF && i == len(points3D) {
   318  					break
   319  				}
   320  				t.Error(err)
   321  				return false
   322  			}
   323  			if row != (Point2D{X: points3D[i].X, Y: points3D[i].Y}) {
   324  				t.Errorf("points mismatch at row index %d: want=%v got=%v", i, points3D[i], row)
   325  				return false
   326  			}
   327  		}
   328  		return true
   329  	})
   330  	if err != nil {
   331  		t.Error(err)
   332  	}
   333  }
   334  
   335  func TestReaderSeekToRow(t *testing.T) {
   336  	type rowType struct {
   337  		Name utf8string `parquet:",dict"`
   338  	}
   339  
   340  	rows := rowsOf(10, rowType{})
   341  	buf := new(bytes.Buffer)
   342  	err := writeParquetFile(buf, rows)
   343  	if err != nil {
   344  		t.Fatal(err)
   345  	}
   346  
   347  	reader := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   348  	for i := 0; i < 10; i++ {
   349  		if err := reader.SeekToRow(int64(i)); err != nil {
   350  			t.Fatalf("seek to row %d: %v", i, err)
   351  		}
   352  
   353  		row := new(rowType)
   354  		err := reader.Read(row)
   355  		if err != nil {
   356  			t.Fatalf("reading row %d: %v", i, err)
   357  		}
   358  
   359  		if *row != rows[i] {
   360  			t.Fatalf("row %d mismatch: got=%+v want=%+v", i, *row, rows[i])
   361  		}
   362  	}
   363  }
   364  
   365  func TestSeekToRowNoDict(t *testing.T) {
   366  	type rowType struct {
   367  		Name utf8string `parquet:","` // no dictionary encoding
   368  	}
   369  
   370  	// write samples to in-memory buffer
   371  	buf := new(bytes.Buffer)
   372  	schema := parquet.SchemaOf(new(rowType))
   373  	w := parquet.NewWriter(buf, schema)
   374  	sample := rowType{
   375  		Name: "foo1",
   376  	}
   377  	// write two rows
   378  	w.Write(sample)
   379  	sample.Name = "foo2"
   380  	w.Write(sample)
   381  	w.Close()
   382  
   383  	// create reader
   384  	r := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   385  
   386  	// read second row
   387  	r.SeekToRow(1)
   388  	row := new(rowType)
   389  	err := r.Read(row)
   390  	if err != nil {
   391  		t.Fatalf("reading row: %v", err)
   392  	}
   393  	// fmt.Println(&sample, row)
   394  	if *row != sample {
   395  		t.Fatalf("read != write")
   396  	}
   397  }
   398  
   399  func TestSeekToRowReadAll(t *testing.T) {
   400  	type rowType struct {
   401  		Name utf8string `parquet:",dict"`
   402  	}
   403  
   404  	// write samples to in-memory buffer
   405  	buf := new(bytes.Buffer)
   406  	schema := parquet.SchemaOf(new(rowType))
   407  	w := parquet.NewWriter(buf, schema)
   408  	sample := rowType{
   409  		Name: "foo1",
   410  	}
   411  	// write two rows
   412  	w.Write(sample)
   413  	sample.Name = "foo2"
   414  	w.Write(sample)
   415  	w.Close()
   416  
   417  	// create reader
   418  	r := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   419  
   420  	// read first row
   421  	r.SeekToRow(0)
   422  	row := new(rowType)
   423  	err := r.Read(row)
   424  	if err != nil {
   425  		t.Fatalf("reading row: %v", err)
   426  	}
   427  	// read second row
   428  	r.SeekToRow(1)
   429  	row = new(rowType)
   430  	err = r.Read(row)
   431  	if err != nil {
   432  		t.Fatalf("reading row: %v", err)
   433  	}
   434  	// fmt.Println(&sample, row)
   435  	if *row != sample {
   436  		t.Fatalf("read != write")
   437  	}
   438  }
   439  
   440  func TestSeekToRowDictReadSecond(t *testing.T) {
   441  	type rowType struct {
   442  		Name utf8string `parquet:",dict"`
   443  	}
   444  
   445  	// write samples to in-memory buffer
   446  	buf := new(bytes.Buffer)
   447  	schema := parquet.SchemaOf(new(rowType))
   448  	w := parquet.NewWriter(buf, schema)
   449  	sample := rowType{
   450  		Name: "foo1",
   451  	}
   452  	// write two rows
   453  	w.Write(sample)
   454  	sample.Name = "foo2"
   455  	w.Write(sample)
   456  	w.Close()
   457  
   458  	// create reader
   459  	r := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   460  
   461  	// read second row
   462  	r.SeekToRow(1)
   463  	row := new(rowType)
   464  	err := r.Read(row)
   465  	if err != nil {
   466  		t.Fatalf("reading row: %v", err)
   467  	}
   468  	// fmt.Println(&sample, row)
   469  	if *row != sample {
   470  		t.Fatalf("read != write")
   471  	}
   472  }
   473  
   474  func TestSeekToRowDictReadMultiplePages(t *testing.T) {
   475  	type rowType struct {
   476  		Name utf8string `parquet:",dict"`
   477  	}
   478  
   479  	// write samples to in-memory buffer
   480  	buf := new(bytes.Buffer)
   481  	schema := parquet.SchemaOf(new(rowType))
   482  	w := parquet.NewWriter(buf, schema, &parquet.WriterConfig{
   483  		PageBufferSize: 10,
   484  	})
   485  	sample := rowType{
   486  		Name: "foo1",
   487  	}
   488  
   489  	// write enough rows to spill over a single page
   490  	for i := 0; i < 10; i++ {
   491  		w.Write(sample)
   492  	}
   493  	sample.Name = "foo2"
   494  	w.Write(sample)
   495  	w.Close()
   496  
   497  	// create reader
   498  	r := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   499  
   500  	// read 11th row
   501  	r.SeekToRow(10)
   502  	row := new(rowType)
   503  	err := r.Read(row)
   504  	if err != nil {
   505  		t.Fatalf("reading row: %v", err)
   506  	}
   507  	if *row != sample {
   508  		t.Fatalf("read != write")
   509  	}
   510  }