github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/reader_go18_test.go

//go:build go1.18

package parquet_test

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"math/rand"
	"os"
	"reflect"
	"testing"

	"github.com/segmentio/parquet-go"
)

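// TestGenericReader exercises the generic reader round trip for each of the
// row and column types covered by the test suite.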
func TestGenericReader(t *testing.T) {
	testGenericReader[booleanColumn](t)
	testGenericReader[int32Column](t)
	testGenericReader[int64Column](t)
	testGenericReader[int96Column](t)
	testGenericReader[floatColumn](t)
	testGenericReader[doubleColumn](t)
	testGenericReader[byteArrayColumn](t)
	testGenericReader[fixedLenByteArrayColumn](t)
	testGenericReader[stringColumn](t)
	testGenericReader[indexedStringColumn](t)
	testGenericReader[uuidColumn](t)
	testGenericReader[timeColumn](t)
	testGenericReader[timeInMillisColumn](t)
	testGenericReader[mapColumn](t)
	testGenericReader[decimalColumn](t)
	testGenericReader[addressBook](t)
	testGenericReader[contact](t)
	testGenericReader[listColumn2](t)
	testGenericReader[listColumn1](t)
	testGenericReader[listColumn0](t)
	testGenericReader[nestedListColumn1](t)
	testGenericReader[nestedListColumn](t)
	testGenericReader[*contact](t)
	testGenericReader[paddedBooleanColumn](t)
	testGenericReader[optionalInt32Column](t)
	testGenericReader[repeatedInt32Column](t)
}

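// testGenericReader runs a quickCheck property over randomly generated rows
// of type Row, failing the test if a write/read round trip does not
// reproduce the input.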
func testGenericReader[Row any](t *testing.T) {
	var model Row
	t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) {
		err := quickCheck(func(rows []Row) bool {
			if len(rows) == 0 {
				return true // TODO: fix support for parquet files with zero rows
			}
			if err := testGenericReaderRows(rows); err != nil {
				t.Error(err)
				return false
			}
			return true
		})
		if err != nil {
			t.Error(err)
		}
	})
}

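// testGenericReaderRows writes the rows to an in-memory buffer with a
// GenericWriter, reads them back with a GenericReader, and returns an error
// if the values read differ from the values written.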
func testGenericReaderRows[Row any](rows []Row) error {
	setNullPointers(rows)
	buffer := new(bytes.Buffer)
	writer := parquet.NewGenericWriter[Row](buffer)
	_, err := writer.Write(rows)
	if err != nil {
		return err
	}
	if err := writer.Close(); err != nil {
		return err
	}
	reader := parquet.NewGenericReader[Row](bytes.NewReader(buffer.Bytes()))
	result := make([]Row, len(rows))
	n, err := reader.Read(result)
	if err != nil && !errors.Is(err, io.EOF) {
		return err
	}
	if n < len(rows) {
		return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n)
	}
	if !reflect.DeepEqual(rows, result) {
		return fmt.Errorf("rows mismatch:\nwant: %+v\ngot: %+v", rows, result)
	}
	return nil
}

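// TestIssue400 checks that a value stored inside an optional list of groups
// survives a write/read round trip.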
func TestIssue400(t *testing.T) {
	type B struct {
		Name string
	}
	type A struct {
		B []B `parquet:",optional"`
	}

	b := new(bytes.Buffer)
	w := parquet.NewGenericWriter[A](b)
	expect := []A{
		{
			B: []B{
				{
					// A random UUID string so we can tell from the binary
					// parquet output whether the value was actually written.
					Name: "9e7eb1f0-bbcc-43ec-bfad-a9fac1bb0feb",
				},
			},
		},
	}
	_, err := w.Write(expect)
	if err != nil {
		t.Fatal(err)
	}
	if err = w.Close(); err != nil {
		t.Fatal(err)
	}

	r := parquet.NewGenericReader[A](bytes.NewReader(b.Bytes()))
	values := make([]A, 1)
	_, err = r.Read(values)
	if err != nil {
		t.Fatal(err)
	}
	if !reflect.DeepEqual(expect[0], values[0]) {
		t.Errorf("want %q got %q", expect[0], values[0])
	}
}

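// TestReadMinPageSize reads back a multi-page file using buffer sizes below,
// equal to, and above the smallest page, as well as at and beyond the total
// number of rows.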
func TestReadMinPageSize(t *testing.T) {
	// NOTE: the smallest page holds 307 rows for the MyRow schema.
	t.Run("test read less than min page size", func(t *testing.T) { testReadMinPageSize(128, t) })
	t.Run("test read equal to min page size", func(t *testing.T) { testReadMinPageSize(307, t) })
	t.Run("test read more than min page size", func(t *testing.T) { testReadMinPageSize(384, t) })
	// NOTE: num rows is 20,000
	t.Run("test read equal to num rows", func(t *testing.T) { testReadMinPageSize(20_000, t) })
	t.Run("test read more than num rows", func(t *testing.T) { testReadMinPageSize(25_000, t) })
}

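// testReadMinPageSize writes 20,000 rows to a temporary parquet file split
// across multiple pages and row groups, then reads them back readSize rows
// at a time and verifies every row.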
func testReadMinPageSize(readSize int, t *testing.T) {
	type MyRow struct {
		ID    [16]byte `parquet:"id,delta,uuid"`
		File  string   `parquet:"file,dict,zstd"`
		Index int64    `parquet:"index,delta,zstd"`
	}

	numRows := 20_000
	maxPageBytes := 5000

	tmp, err := os.CreateTemp("/tmp", "*.parquet")
	if err != nil {
		t.Fatal("os.CreateTemp: ", err)
	}
	path := tmp.Name()
	defer os.Remove(path)
	t.Log("file:", path)

	// The page buffer size ensures we get multiple pages out of this example.
	w := parquet.NewGenericWriter[MyRow](tmp, parquet.PageBufferSize(maxPageBytes))
	// We need to write one row at a time here, as writing many at once
	// disregards the PageBufferSize option.
	for i := 0; i < numRows; i++ {
		row := MyRow{
			ID:    [16]byte{15: byte(i)},
			File:  "hi" + fmt.Sprint(i),
			Index: int64(i),
		}
		_, err := w.Write([]MyRow{row})
		if err != nil {
			t.Fatal("w.Write: ", err)
		}
		// Flush every 5,000 rows (maxPageBytes doubles as the per-group row
		// count here) to write them as a row group: 4 groups total (20k/5k).
		if (i+1)%maxPageBytes == 0 {
			err = w.Flush()
			if err != nil {
				t.Fatal("w.Flush: ", err)
			}
		}
	}
	err = w.Close()
	if err != nil {
		t.Fatal("w.Close: ", err)
	}
	err = tmp.Close()
	if err != nil {
		t.Fatal("tmp.Close: ", err)
	}

	file, err := os.Open(path)
	if err != nil {
		t.Fatal("os.Open: ", err)
	}
	reader := parquet.NewGenericReader[MyRow](file)
	read := int64(0)
	nRows := reader.NumRows()
	rows := make([]MyRow, 0, nRows)
	buf := make([]MyRow, readSize) // NOTE: the smallest page holds 307 rows for the MyRow schema

	for read < nRows {
		num, err := reader.Read(buf)
		read += int64(num)
		if err != nil && !errors.Is(err, io.EOF) {
			t.Fatal("Read:", err)
		}
		// The whole buffer is appended even when fewer than len(buf) rows
		// were read; the verification below only inspects the first numRows.
		rows = append(rows, buf...)
	}

	if err := reader.Close(); err != nil {
		t.Fatal("Close", err)
	}

	if len(rows) < numRows {
		t.Fatalf("not enough values were read: want=%d got=%d", numRows, len(rows))
	}
	for i, row := range rows[:numRows] {
		id := [16]byte{15: byte(i)}
		file := "hi" + fmt.Sprint(i)
		index := int64(i)

		if row.ID != id || row.File != file || row.Index != index {
			t.Fatalf("rows mismatch at index: %d got: %+v", i, row)
		}
	}
}

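// BenchmarkGenericReader measures read throughput of the generic reader for
// each supported row type.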
func BenchmarkGenericReader(b *testing.B) {
	benchmarkGenericReader[benchmarkRowType](b)
	benchmarkGenericReader[booleanColumn](b)
	benchmarkGenericReader[int32Column](b)
	benchmarkGenericReader[int64Column](b)
	benchmarkGenericReader[floatColumn](b)
	benchmarkGenericReader[doubleColumn](b)
	benchmarkGenericReader[byteArrayColumn](b)
	benchmarkGenericReader[fixedLenByteArrayColumn](b)
	benchmarkGenericReader[stringColumn](b)
	benchmarkGenericReader[indexedStringColumn](b)
	benchmarkGenericReader[uuidColumn](b)
	benchmarkGenericReader[timeColumn](b)
	benchmarkGenericReader[timeInMillisColumn](b)
	benchmarkGenericReader[mapColumn](b)
	benchmarkGenericReader[decimalColumn](b)
	benchmarkGenericReader[contact](b)
	benchmarkGenericReader[paddedBooleanColumn](b)
	benchmarkGenericReader[optionalInt32Column](b)
}

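// benchmarkGenericReader compares reading rows one at a time through the
// non-generic Reader (go1.17) with reading slices of rows through the
// generic row group reader (go1.18), using the same in-memory buffer as the
// source row group.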
func benchmarkGenericReader[Row generator[Row]](b *testing.B) {
	var model Row
	b.Run(reflect.TypeOf(model).Name(), func(b *testing.B) {
		prng := rand.New(rand.NewSource(0))
		rows := make([]Row, benchmarkNumRows)
		for i := range rows {
			rows[i] = rows[i].generate(prng)
		}

		rowbuf := make([]Row, benchmarkRowsPerStep)
		buffer := parquet.NewGenericBuffer[Row]()
		buffer.Write(rows)

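		// go1.17 path: decode one row per call through the non-generic Reader API.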
		b.Run("go1.17", func(b *testing.B) {
			reader := parquet.NewRowGroupReader(buffer)
			benchmarkRowsPerSecond(b, func() int {
				for i := range rowbuf {
					if err := reader.Read(&rowbuf[i]); err != nil {
						if err != io.EOF {
							b.Fatal(err)
						} else {
							reader.Reset()
						}
					}
				}
				return len(rowbuf)
			})
		})

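		// go1.18 path: decode a batch of rows per call through the generic reader.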
		b.Run("go1.18", func(b *testing.B) {
			reader := parquet.NewGenericRowGroupReader[Row](buffer)
			benchmarkRowsPerSecond(b, func() int {
				n, err := reader.Read(rowbuf)
				if err != nil {
					if err != io.EOF {
						b.Fatal(err)
					} else {
						reader.Reset()
					}
				}
				return n
			})
		})
	})
}