github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/reader_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io"
     7  	"math"
     8  	"math/rand"
     9  	"reflect"
    10  	"testing"
    11  
    12  	"github.com/vc42/parquet-go"
    13  	"github.com/vc42/parquet-go/internal/quick"
    14  )
    15  
    16  func rowsOf(numRows int, model interface{}) rows {
    17  	prng := rand.New(rand.NewSource(0))
    18  	return randomRowsOf(prng, numRows, model)
    19  }
    20  
    21  func randomRowsOf(prng *rand.Rand, numRows int, model interface{}) rows {
    22  	typ := reflect.TypeOf(model)
    23  	rows := make(rows, numRows)
    24  	makeValue := quick.MakeValueFuncOf(typ)
    25  	for i := range rows {
    26  		v := reflect.New(typ).Elem()
    27  		makeValue(v, prng)
    28  		rows[i] = v.Interface()
    29  	}
    30  	return rows
    31  }
    32  
// readerTests is the shared table of round-trip scenarios used by
// TestReader and the reader benchmarks below. Each entry pairs a
// human-readable scenario name with a zero value of the row model to
// write and read back.
var readerTests = []struct {
	scenario string
	model    interface{}
}{
	{
		scenario: "BOOLEAN",
		model:    booleanColumn{},
	},

	{
		scenario: "INT32",
		model:    int32Column{},
	},

	{
		scenario: "INT64",
		model:    int64Column{},
	},

	{
		scenario: "INT96",
		model:    int96Column{},
	},

	{
		scenario: "FLOAT",
		model:    floatColumn{},
	},

	{
		scenario: "DOUBLE",
		model:    doubleColumn{},
	},

	{
		scenario: "BYTE_ARRAY",
		model:    byteArrayColumn{},
	},

	{
		scenario: "FIXED_LEN_BYTE_ARRAY",
		model:    fixedLenByteArrayColumn{},
	},

	{
		scenario: "STRING",
		model:    stringColumn{},
	},

	{
		scenario: "STRING (dict)",
		model:    indexedStringColumn{},
	},

	{
		scenario: "UUID",
		model:    uuidColumn{},
	},

	{
		scenario: "DECIMAL",
		model:    decimalColumn{},
	},

	{
		scenario: "AddressBook",
		model:    addressBook{},
	},

	{
		scenario: "one optional level",
		model:    listColumn2{},
	},

	{
		scenario: "one repeated level",
		model:    listColumn1{},
	},

	{
		scenario: "two repeated levels",
		model:    listColumn0{},
	},

	{
		// NOTE(review): this entry uses the same model as the
		// "two repeated levels" scenario above, so the two cases are
		// duplicates — confirm whether a deeper-nested model was intended.
		scenario: "three repeated levels",
		model:    listColumn0{},
	},

	{
		scenario: "nested lists",
		model:    nestedListColumn{},
	},

	{
		scenario: "key-value pairs",
		model: struct {
			KeyValuePairs map[utf8string]utf8string
		}{},
	},

	{
		scenario: "multiple key-value pairs",
		model: struct {
			KeyValuePairs0 map[utf8string]utf8string
			KeyValuePairs1 map[utf8string]utf8string
			KeyValuePairs2 map[utf8string]utf8string
		}{},
	},

	{
		scenario: "repeated key-value pairs",
		model: struct {
			RepeatedKeyValuePairs []map[utf8string]utf8string
		}{},
	},

	{
		scenario: "map of repeated values",
		model: struct {
			MapOfRepeated map[utf8string][]utf8string
		}{},
	},
}
   157  
   158  func TestReader(t *testing.T) {
   159  	buf := new(bytes.Buffer)
   160  	file := bytes.NewReader(nil)
   161  
   162  	for _, test := range readerTests {
   163  		t.Run(test.scenario, func(t *testing.T) {
   164  			const N = 42
   165  
   166  			rowType := reflect.TypeOf(test.model)
   167  			rowPtr := reflect.New(rowType)
   168  			rowZero := reflect.Zero(rowType)
   169  			rowValue := rowPtr.Elem()
   170  
   171  			for n := 1; n < N; n++ {
   172  				t.Run(fmt.Sprintf("N=%d", n), func(t *testing.T) {
   173  					defer buf.Reset()
   174  					rows := rowsOf(n, test.model)
   175  
   176  					if err := writeParquetFileWithBuffer(buf, rows); err != nil {
   177  						t.Fatal(err)
   178  					}
   179  
   180  					file.Reset(buf.Bytes())
   181  					r := parquet.NewReader(file, parquet.SchemaOf(test.model))
   182  
   183  					for i, v := range rows {
   184  						if err := r.Read(rowPtr.Interface()); err != nil {
   185  							t.Fatal(err)
   186  						}
   187  						if !reflect.DeepEqual(rowValue.Interface(), v) {
   188  							t.Errorf("row mismatch at index %d\nwant = %+v\ngot  = %+v", i, v, rowValue.Interface())
   189  						}
   190  						rowValue.Set(rowZero)
   191  					}
   192  
   193  					if err := r.Read(rowPtr.Interface()); err != io.EOF {
   194  						t.Errorf("expected EOF after reading all values but got: %v", err)
   195  					}
   196  				})
   197  			}
   198  		})
   199  	}
   200  }
   201  
   202  func BenchmarkReaderReadType(b *testing.B) {
   203  	buf := new(bytes.Buffer)
   204  	file := bytes.NewReader(nil)
   205  
   206  	for _, test := range readerTests {
   207  		b.Run(test.scenario, func(b *testing.B) {
   208  			defer buf.Reset()
   209  			rows := rowsOf(benchmarkNumRows, test.model)
   210  
   211  			if err := writeParquetFile(buf, rows); err != nil {
   212  				b.Fatal(err)
   213  			}
   214  			file.Reset(buf.Bytes())
   215  			f, err := parquet.OpenFile(file, file.Size())
   216  			if err != nil {
   217  				b.Fatal(err)
   218  			}
   219  
   220  			rowType := reflect.TypeOf(test.model)
   221  			rowPtr := reflect.New(rowType)
   222  			rowZero := reflect.Zero(rowType)
   223  			rowValue := rowPtr.Elem()
   224  
   225  			r := parquet.NewReader(f)
   226  			p := rowPtr.Interface()
   227  
   228  			benchmarkRowsPerSecond(b, func() (n int) {
   229  				for i := 0; i < benchmarkRowsPerStep; i++ {
   230  					if err := r.Read(p); err != nil {
   231  						if err == io.EOF {
   232  							r.Reset()
   233  						} else {
   234  							b.Fatal(err)
   235  						}
   236  					}
   237  				}
   238  				rowValue.Set(rowZero)
   239  				return benchmarkRowsPerStep
   240  			})
   241  
   242  			b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows)))
   243  		})
   244  	}
   245  }
   246  
   247  func BenchmarkReaderReadRow(b *testing.B) {
   248  	buf := new(bytes.Buffer)
   249  	file := bytes.NewReader(nil)
   250  
   251  	for _, test := range readerTests {
   252  		b.Run(test.scenario, func(b *testing.B) {
   253  			defer buf.Reset()
   254  			rows := rowsOf(benchmarkNumRows, test.model)
   255  
   256  			if err := writeParquetFile(buf, rows); err != nil {
   257  				b.Fatal(err)
   258  			}
   259  			file.Reset(buf.Bytes())
   260  			f, err := parquet.OpenFile(file, file.Size())
   261  			if err != nil {
   262  				b.Fatal(err)
   263  			}
   264  
   265  			r := parquet.NewReader(f)
   266  			rowbuf := make([]parquet.Row, benchmarkRowsPerStep)
   267  
   268  			benchmarkRowsPerSecond(b, func() int {
   269  				n, err := r.ReadRows(rowbuf)
   270  				if err != nil {
   271  					if err == io.EOF {
   272  						r.Reset()
   273  					} else {
   274  						b.Fatal(err)
   275  					}
   276  				}
   277  				return n
   278  			})
   279  
   280  			b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows)))
   281  		})
   282  	}
   283  }
   284  
   285  func TestReaderReadSubset(t *testing.T) {
   286  	// In this example we'll write 3 columns to the file - X, Y, and Z, but
   287  	// we'll only read out the X and Y columns. Returns true if all writes
   288  	// and reads were successful, and false otherwise.
   289  	type Point3D struct{ X, Y, Z int64 }
   290  	type Point2D struct{ X, Y int64 }
   291  
   292  	err := quickCheck(func(points3D []Point3D) bool {
   293  		if len(points3D) == 0 {
   294  			return true
   295  		}
   296  		buf := new(bytes.Buffer)
   297  		err := writeParquetFile(buf, makeRows(points3D))
   298  		if err != nil {
   299  			t.Error(err)
   300  			return false
   301  		}
   302  		reader := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   303  		for i := 0; ; i++ {
   304  			row := Point2D{}
   305  			err := reader.Read(&row)
   306  			if err != nil {
   307  				if err == io.EOF && i == len(points3D) {
   308  					break
   309  				}
   310  				t.Error(err)
   311  				return false
   312  			}
   313  			if row != (Point2D{X: points3D[i].X, Y: points3D[i].Y}) {
   314  				t.Errorf("points mismatch at row index %d: want=%v got=%v", i, points3D[i], row)
   315  				return false
   316  			}
   317  		}
   318  		return true
   319  	})
   320  	if err != nil {
   321  		t.Error(err)
   322  	}
   323  }
   324  
   325  func TestReaderSeekToRow(t *testing.T) {
   326  	type rowType struct {
   327  		Name utf8string `parquet:",dict"`
   328  	}
   329  
   330  	rows := rowsOf(10, rowType{})
   331  	buf := new(bytes.Buffer)
   332  	err := writeParquetFile(buf, rows)
   333  	if err != nil {
   334  		t.Fatal(err)
   335  	}
   336  
   337  	reader := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   338  	for i := 0; i < 10; i++ {
   339  		if err := reader.SeekToRow(int64(i)); err != nil {
   340  			t.Fatalf("seek to row %d: %v", i, err)
   341  		}
   342  
   343  		row := new(rowType)
   344  		err := reader.Read(row)
   345  		if err != nil {
   346  			t.Fatalf("reading row %d: %v", i, err)
   347  		}
   348  
   349  		if *row != rows[i] {
   350  			t.Fatalf("row %d mismatch: got=%+v want=%+v", i, *row, rows[i])
   351  		}
   352  	}
   353  }
   354  
   355  func TestSeekToRowNoDict(t *testing.T) {
   356  	type rowType struct {
   357  		Name utf8string `parquet:","` // no dictionary encoding
   358  	}
   359  
   360  	// write samples to in-memory buffer
   361  	buf := new(bytes.Buffer)
   362  	schema := parquet.SchemaOf(new(rowType))
   363  	w := parquet.NewWriter(buf, schema)
   364  	sample := rowType{
   365  		Name: "foo1",
   366  	}
   367  	// write two rows
   368  	w.Write(sample)
   369  	sample.Name = "foo2"
   370  	w.Write(sample)
   371  	w.Close()
   372  
   373  	// create reader
   374  	r := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   375  
   376  	// read second row
   377  	r.SeekToRow(1)
   378  	row := new(rowType)
   379  	err := r.Read(row)
   380  	if err != nil {
   381  		t.Fatalf("reading row: %v", err)
   382  	}
   383  	// fmt.Println(&sample, row)
   384  	if *row != sample {
   385  		t.Fatalf("read != write")
   386  	}
   387  }
   388  
   389  func TestSeekToRowReadAll(t *testing.T) {
   390  	type rowType struct {
   391  		Name utf8string `parquet:",dict"`
   392  	}
   393  
   394  	// write samples to in-memory buffer
   395  	buf := new(bytes.Buffer)
   396  	schema := parquet.SchemaOf(new(rowType))
   397  	w := parquet.NewWriter(buf, schema)
   398  	sample := rowType{
   399  		Name: "foo1",
   400  	}
   401  	// write two rows
   402  	w.Write(sample)
   403  	sample.Name = "foo2"
   404  	w.Write(sample)
   405  	w.Close()
   406  
   407  	// create reader
   408  	r := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   409  
   410  	// read first row
   411  	r.SeekToRow(0)
   412  	row := new(rowType)
   413  	err := r.Read(row)
   414  	if err != nil {
   415  		t.Fatalf("reading row: %v", err)
   416  	}
   417  	// read second row
   418  	r.SeekToRow(1)
   419  	row = new(rowType)
   420  	err = r.Read(row)
   421  	if err != nil {
   422  		t.Fatalf("reading row: %v", err)
   423  	}
   424  	// fmt.Println(&sample, row)
   425  	if *row != sample {
   426  		t.Fatalf("read != write")
   427  	}
   428  }
   429  
   430  func TestSeekToRowDictReadSecond(t *testing.T) {
   431  	type rowType struct {
   432  		Name utf8string `parquet:",dict"`
   433  	}
   434  
   435  	// write samples to in-memory buffer
   436  	buf := new(bytes.Buffer)
   437  	schema := parquet.SchemaOf(new(rowType))
   438  	w := parquet.NewWriter(buf, schema)
   439  	sample := rowType{
   440  		Name: "foo1",
   441  	}
   442  	// write two rows
   443  	w.Write(sample)
   444  	sample.Name = "foo2"
   445  	w.Write(sample)
   446  	w.Close()
   447  
   448  	// create reader
   449  	r := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   450  
   451  	// read second row
   452  	r.SeekToRow(1)
   453  	row := new(rowType)
   454  	err := r.Read(row)
   455  	if err != nil {
   456  		t.Fatalf("reading row: %v", err)
   457  	}
   458  	// fmt.Println(&sample, row)
   459  	if *row != sample {
   460  		t.Fatalf("read != write")
   461  	}
   462  }
   463  
   464  func TestSeekToRowDictReadMultiplePages(t *testing.T) {
   465  	type rowType struct {
   466  		Name utf8string `parquet:",dict"`
   467  	}
   468  
   469  	// write samples to in-memory buffer
   470  	buf := new(bytes.Buffer)
   471  	schema := parquet.SchemaOf(new(rowType))
   472  	w := parquet.NewWriter(buf, schema, &parquet.WriterConfig{
   473  		PageBufferSize: 10,
   474  	})
   475  	sample := rowType{
   476  		Name: "foo1",
   477  	}
   478  
   479  	// write enough rows to spill over a single page
   480  	for i := 0; i < 10; i++ {
   481  		w.Write(sample)
   482  	}
   483  	sample.Name = "foo2"
   484  	w.Write(sample)
   485  	w.Close()
   486  
   487  	// create reader
   488  	r := parquet.NewReader(bytes.NewReader(buf.Bytes()))
   489  
   490  	// read 11th row
   491  	r.SeekToRow(10)
   492  	row := new(rowType)
   493  	err := r.Read(row)
   494  	if err != nil {
   495  		t.Fatalf("reading row: %v", err)
   496  	}
   497  	if *row != sample {
   498  		t.Fatalf("read != write")
   499  	}
   500  }