github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/parquet_go18_test.go (about)

     1  //go:build go1.18
     2  
     3  package parquet_test
     4  
     5  import (
     6  	"bytes"
     7  	"fmt"
     8  	"io"
     9  	"log"
    10  	"os"
    11  	"reflect"
    12  	"testing"
    13  
    14  	"github.com/segmentio/parquet-go"
    15  	"google.golang.org/protobuf/types/known/structpb"
    16  )
    17  
    18  func ExampleReadFile() {
    19  	type Row struct {
    20  		ID   int64  `parquet:"id"`
    21  		Name string `parquet:"name,zstd"`
    22  	}
    23  
    24  	ExampleWriteFile()
    25  
    26  	rows, err := parquet.ReadFile[Row]("/tmp/file.parquet")
    27  	if err != nil {
    28  		log.Fatal(err)
    29  	}
    30  
    31  	for _, row := range rows {
    32  		fmt.Printf("%d: %q\n", row.ID, row.Name)
    33  	}
    34  
    35  	// Output:
    36  	// 0: "Bob"
    37  	// 1: "Alice"
    38  	// 2: "Franky"
    39  }
    40  
    41  func ExampleWriteFile() {
    42  	type Row struct {
    43  		ID   int64  `parquet:"id"`
    44  		Name string `parquet:"name,zstd"`
    45  	}
    46  
    47  	if err := parquet.WriteFile("/tmp/file.parquet", []Row{
    48  		{ID: 0, Name: "Bob"},
    49  		{ID: 1, Name: "Alice"},
    50  		{ID: 2, Name: "Franky"},
    51  	}); err != nil {
    52  		log.Fatal(err)
    53  	}
    54  
    55  	// Output:
    56  }
    57  
    58  func ExampleRead_any() {
    59  	type Row struct{ FirstName, LastName string }
    60  
    61  	buf := new(bytes.Buffer)
    62  	err := parquet.Write(buf, []Row{
    63  		{FirstName: "Luke", LastName: "Skywalker"},
    64  		{FirstName: "Han", LastName: "Solo"},
    65  		{FirstName: "R2", LastName: "D2"},
    66  	})
    67  	if err != nil {
    68  		log.Fatal(err)
    69  	}
    70  
    71  	file := bytes.NewReader(buf.Bytes())
    72  
    73  	rows, err := parquet.Read[any](file, file.Size())
    74  	if err != nil {
    75  		log.Fatal(err)
    76  	}
    77  
    78  	for _, row := range rows {
    79  		fmt.Printf("%q\n", row)
    80  	}
    81  
    82  	// Output:
    83  	// map["FirstName":"Luke" "LastName":"Skywalker"]
    84  	// map["FirstName":"Han" "LastName":"Solo"]
    85  	// map["FirstName":"R2" "LastName":"D2"]
    86  }
    87  
    88  func ExampleWrite_any() {
    89  	schema := parquet.SchemaOf(struct {
    90  		FirstName string
    91  		LastName  string
    92  	}{})
    93  
    94  	buf := new(bytes.Buffer)
    95  	err := parquet.Write[any](
    96  		buf,
    97  		[]any{
    98  			map[string]string{"FirstName": "Luke", "LastName": "Skywalker"},
    99  			map[string]string{"FirstName": "Han", "LastName": "Solo"},
   100  			map[string]string{"FirstName": "R2", "LastName": "D2"},
   101  		},
   102  		schema,
   103  	)
   104  	if err != nil {
   105  		log.Fatal(err)
   106  	}
   107  
   108  	file := bytes.NewReader(buf.Bytes())
   109  
   110  	rows, err := parquet.Read[any](file, file.Size())
   111  	if err != nil {
   112  		log.Fatal(err)
   113  	}
   114  
   115  	for _, row := range rows {
   116  		fmt.Printf("%q\n", row)
   117  	}
   118  
   119  	// Output:
   120  	// map["FirstName":"Luke" "LastName":"Skywalker"]
   121  	// map["FirstName":"Han" "LastName":"Solo"]
   122  	// map["FirstName":"R2" "LastName":"D2"]
   123  }
   124  
// ExampleSearch demonstrates locating the page that contains a value by
// binary-searching a column index, then seeking the reader to that page
// instead of scanning the whole file. The expected output below depends on
// the exact page boundaries produced by the tiny PageBufferSize used here.
func ExampleSearch() {
	type Row struct{ FirstName, LastName string }

	buf := new(bytes.Buffer)
	// The column being searched should be sorted to avoid a full scan of the
	// column. See the section of the readme on sorting for how to sort on
	// insertion into the parquet file using parquet.SortingColumns
	rows := []Row{
		{FirstName: "C", LastName: "3PO"},
		{FirstName: "Han", LastName: "Solo"},
		{FirstName: "Leia", LastName: "Organa"},
		{FirstName: "Luke", LastName: "Skywalker"},
		{FirstName: "R2", LastName: "D2"},
	}
	// The tiny page buffer size ensures we get multiple pages out of the example above.
	w := parquet.NewGenericWriter[Row](buf, parquet.PageBufferSize(12), parquet.WriteBufferSize(0))
	// Need to write 1 row at a time here as writing many at once disregards PageBufferSize option.
	for _, row := range rows {
		_, err := w.Write([]Row{row})
		if err != nil {
			log.Fatal(err)
		}
	}
	err := w.Close()
	if err != nil {
		log.Fatal(err)
	}

	// Re-open the serialized bytes as a parquet.File to get access to the
	// column and offset indexes used below.
	reader := bytes.NewReader(buf.Bytes())
	file, err := parquet.OpenFile(reader, reader.Size())
	if err != nil {
		log.Fatal(err)
	}

	// Search is scoped to a single RowGroup/ColumnChunk
	rowGroup := file.RowGroups()[0]
	firstNameColChunk := rowGroup.ColumnChunks()[0]

	// Search returns the index of the first page that could contain "Luke";
	// a result equal to NumPages() means the value was not found.
	found := parquet.Search(firstNameColChunk.ColumnIndex(), parquet.ValueOf("Luke"), parquet.ByteArrayType)
	offsetIndex := firstNameColChunk.OffsetIndex()
	fmt.Printf("numPages: %d\n", offsetIndex.NumPages())
	fmt.Printf("result found in page: %d\n", found)
	if found < offsetIndex.NumPages() {
		r := parquet.NewGenericReader[Row](file)
		defer r.Close()
		// Seek to the first row in the page the result was found
		r.SeekToRow(offsetIndex.FirstRowIndex(found))
		result := make([]Row, 2)
		// Read the whole page; errors (including io.EOF) are intentionally
		// ignored since the scan below filters for the match.
		_, _ = r.Read(result)
		// Leia is in index 0 for the page.
		for _, row := range result {
			if row.FirstName == "Luke" {
				fmt.Printf("%q\n", row)
			}
		}
	}

	// Output:
	// numPages: 3
	// result found in page: 1
	// {"Luke" "Skywalker"}
}
   187  
   188  func TestIssue360(t *testing.T) {
   189  	type TestType struct {
   190  		Key []int
   191  	}
   192  
   193  	schema := parquet.SchemaOf(TestType{})
   194  	buffer := parquet.NewGenericBuffer[any](schema)
   195  
   196  	data := make([]any, 1)
   197  	data[0] = TestType{Key: []int{1}}
   198  	_, err := buffer.Write(data)
   199  	if err != nil {
   200  		fmt.Println("Exiting with error: ", err)
   201  		return
   202  	}
   203  
   204  	var out bytes.Buffer
   205  	writer := parquet.NewGenericWriter[any](&out, schema)
   206  
   207  	_, err = parquet.CopyRows(writer, buffer.Rows())
   208  	if err != nil {
   209  		fmt.Println("Exiting with error: ", err)
   210  		return
   211  	}
   212  	writer.Close()
   213  
   214  	br := bytes.NewReader(out.Bytes())
   215  	rows, _ := parquet.Read[any](br, br.Size())
   216  
   217  	expect := []any{
   218  		map[string]any{
   219  			"Key": []any{
   220  				int64(1),
   221  			},
   222  		},
   223  	}
   224  
   225  	assertRowsEqual(t, expect, rows)
   226  }
   227  
   228  func TestIssue362ParquetReadFromGenericReaders(t *testing.T) {
   229  	path := "testdata/dms_test_table_LOAD00000001.parquet"
   230  	fp, err := os.Open(path)
   231  	if err != nil {
   232  		t.Fatal(err)
   233  	}
   234  	defer fp.Close()
   235  
   236  	r1 := parquet.NewGenericReader[any](fp)
   237  	rows1 := make([]any, r1.NumRows())
   238  	_, err = r1.Read(rows1)
   239  	if err != nil && err != io.EOF {
   240  		t.Fatal(err)
   241  	}
   242  
   243  	r2 := parquet.NewGenericReader[any](fp)
   244  	rows2 := make([]any, r2.NumRows())
   245  	_, err = r2.Read(rows2)
   246  	if err != nil && err != io.EOF {
   247  		t.Fatal(err)
   248  	}
   249  }
   250  
   251  func TestIssue362ParquetReadFile(t *testing.T) {
   252  	rows1, err := parquet.ReadFile[any]("testdata/dms_test_table_LOAD00000001.parquet")
   253  	if err != nil {
   254  		t.Fatal(err)
   255  	}
   256  
   257  	rows2, err := parquet.ReadFile[any]("testdata/dms_test_table_LOAD00000001.parquet")
   258  	if err != nil {
   259  		t.Fatal(err)
   260  	}
   261  
   262  	assertRowsEqual(t, rows1, rows2)
   263  }
   264  
   265  func TestIssue368(t *testing.T) {
   266  	f, err := os.Open("testdata/issue368.parquet")
   267  	if err != nil {
   268  		t.Fatal(err)
   269  	}
   270  	defer f.Close()
   271  
   272  	info, err := f.Stat()
   273  	if err != nil {
   274  		t.Fatal(err)
   275  	}
   276  
   277  	pf, err := parquet.OpenFile(f, info.Size())
   278  	if err != nil {
   279  		t.Fatal(err)
   280  	}
   281  
   282  	reader := parquet.NewGenericReader[any](pf)
   283  	defer reader.Close()
   284  
   285  	trs := make([]any, 1)
   286  	for {
   287  		_, err := reader.Read(trs)
   288  		if err != nil {
   289  			break
   290  		}
   291  	}
   292  }
   293  
   294  func TestIssue377(t *testing.T) {
   295  	type People struct {
   296  		Name string
   297  		Age  int
   298  	}
   299  
   300  	type Nested struct {
   301  		P  []People
   302  		F  string
   303  		GF string
   304  	}
   305  	row1 := Nested{P: []People{
   306  		{
   307  			Name: "Bob",
   308  			Age:  10,
   309  		}}}
   310  	ods := []Nested{
   311  		row1,
   312  	}
   313  	buf := new(bytes.Buffer)
   314  	w := parquet.NewGenericWriter[Nested](buf)
   315  	_, err := w.Write(ods)
   316  	if err != nil {
   317  		t.Fatal("write error: ", err)
   318  	}
   319  	w.Close()
   320  
   321  	file := bytes.NewReader(buf.Bytes())
   322  	rows, err := parquet.Read[Nested](file, file.Size())
   323  	if err != nil {
   324  		t.Fatal("read error: ", err)
   325  	}
   326  
   327  	assertRowsEqual(t, rows, ods)
   328  }
   329  
   330  func TestIssue423(t *testing.T) {
   331  	type Inner struct {
   332  		Value string `parquet:","`
   333  	}
   334  	type Outer struct {
   335  		Label string  `parquet:","`
   336  		Inner Inner   `parquet:",json"`
   337  		Slice []Inner `parquet:",json"`
   338  		// This is the only tricky situation. Because we're delegating to json Marshaler/Unmarshaler
   339  		// We use the json tags for optionality.
   340  		Ptr *Inner `json:",omitempty" parquet:",json"`
   341  
   342  		// This tests BC behavior that slices of bytes and json strings still get written/read in a BC way.
   343  		String        string                     `parquet:",json"`
   344  		Bytes         []byte                     `parquet:",json"`
   345  		MapOfStructPb map[string]*structpb.Value `parquet:",json"`
   346  		StructPB      *structpb.Value            `parquet:",json"`
   347  	}
   348  
   349  	writeRows := []Outer{
   350  		{
   351  			Label: "welp",
   352  			Inner: Inner{
   353  				Value: "this is a string",
   354  			},
   355  			Slice: []Inner{
   356  				{
   357  					Value: "in a slice",
   358  				},
   359  			},
   360  			Ptr:    nil,
   361  			String: `{"hello":"world"}`,
   362  			Bytes:  []byte(`{"goodbye":"world"}`),
   363  			MapOfStructPb: map[string]*structpb.Value{
   364  				"answer": structpb.NewNumberValue(42.00),
   365  			},
   366  			StructPB: structpb.NewBoolValue(true),
   367  		},
   368  		{
   369  			Label: "foxes",
   370  			Inner: Inner{
   371  				Value: "the quick brown fox jumped over the yellow lazy dog.",
   372  			},
   373  			Slice: []Inner{
   374  				{
   375  					Value: "in a slice",
   376  				},
   377  			},
   378  			Ptr: &Inner{
   379  				Value: "not nil",
   380  			},
   381  			String: `{"hello":"world"}`,
   382  			Bytes:  []byte(`{"goodbye":"world"}`),
   383  			MapOfStructPb: map[string]*structpb.Value{
   384  				"doubleAnswer": structpb.NewNumberValue(84.00),
   385  			},
   386  			StructPB: structpb.NewBoolValue(false),
   387  		},
   388  	}
   389  
   390  	schema := parquet.SchemaOf(new(Outer))
   391  	fmt.Println(schema.String())
   392  	buf := new(bytes.Buffer)
   393  	w := parquet.NewGenericWriter[Outer](buf, schema)
   394  	_, err := w.Write(writeRows)
   395  	if err != nil {
   396  		t.Fatal("write error: ", err)
   397  	}
   398  	w.Close()
   399  
   400  	file := bytes.NewReader(buf.Bytes())
   401  	readRows, err := parquet.Read[Outer](file, file.Size())
   402  	if err != nil {
   403  		t.Fatal("read error: ", err)
   404  	}
   405  
   406  	assertRowsEqual(t, writeRows, readRows)
   407  }
   408  
   409  func TestReadFileGenericMultipleRowGroupsMultiplePages(t *testing.T) {
   410  	type MyRow struct {
   411  		ID    [16]byte `parquet:"id,delta,uuid"`
   412  		File  string   `parquet:"file,dict,zstd"`
   413  		Index int64    `parquet:"index,delta,zstd"`
   414  	}
   415  
   416  	numRows := 20_000
   417  	maxPageBytes := 5000
   418  
   419  	tmp, err := os.CreateTemp("/tmp", "*.parquet")
   420  	if err != nil {
   421  		t.Fatal("os.CreateTemp: ", err)
   422  	}
   423  	path := tmp.Name()
   424  	defer os.Remove(path)
   425  	t.Log("file:", path)
   426  
   427  	// The page buffer size ensures we get multiple pages out of this example.
   428  	w := parquet.NewGenericWriter[MyRow](tmp, parquet.PageBufferSize(maxPageBytes))
   429  	// Need to write 1 row at a time here as writing many at once disregards PageBufferSize option.
   430  	for i := 0; i < numRows; i++ {
   431  		row := MyRow{
   432  			ID:    [16]byte{15: byte(i)},
   433  			File:  "hi" + fmt.Sprint(i),
   434  			Index: int64(i),
   435  		}
   436  		_, err := w.Write([]MyRow{row})
   437  		if err != nil {
   438  			t.Fatal("w.Write: ", err)
   439  		}
   440  		// Flush writes rows as row group. 4 total (20k/5k) in this file.
   441  		if (i+1)%maxPageBytes == 0 {
   442  			err = w.Flush()
   443  			if err != nil {
   444  				t.Fatal("w.Flush: ", err)
   445  			}
   446  		}
   447  	}
   448  	err = w.Close()
   449  	if err != nil {
   450  		t.Fatal("w.Close: ", err)
   451  	}
   452  	err = tmp.Close()
   453  	if err != nil {
   454  		t.Fatal("tmp.Close: ", err)
   455  	}
   456  
   457  	rows, err := parquet.ReadFile[MyRow](path)
   458  	if err != nil {
   459  		t.Fatal("parquet.ReadFile: ", err)
   460  	}
   461  
   462  	if len(rows) != numRows {
   463  		t.Fatalf("not enough values were read: want=%d got=%d", len(rows), numRows)
   464  	}
   465  	for i, row := range rows {
   466  		id := [16]byte{15: byte(i)}
   467  		file := "hi" + fmt.Sprint(i)
   468  		index := int64(i)
   469  
   470  		if row.ID != id || row.File != file || row.Index != index {
   471  			t.Fatalf("rows mismatch at index: %d got: %+v", i, row)
   472  		}
   473  	}
   474  }
   475  
   476  func assertRowsEqual[T any](t *testing.T, rows1, rows2 []T) {
   477  	if !reflect.DeepEqual(rows1, rows2) {
   478  		t.Error("rows mismatch")
   479  
   480  		t.Log("want:")
   481  		logRows(t, rows1)
   482  
   483  		t.Log("got:")
   484  		logRows(t, rows2)
   485  	}
   486  }
   487  
   488  func logRows[T any](t *testing.T, rows []T) {
   489  	for _, row := range rows {
   490  		t.Logf(". %#v\n", row)
   491  	}
   492  }