github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/page_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"io"
     6  	"reflect"
     7  	"testing"
     8  
     9  	"github.com/parquet-go/parquet-go"
    10  	"github.com/parquet-go/parquet-go/deprecated"
    11  	"github.com/parquet-go/parquet-go/encoding/plain"
    12  	"github.com/parquet-go/parquet-go/internal/unsafecast"
    13  )
    14  
    15  func TestPage(t *testing.T) {
    16  	t.Run("BOOLEAN", testPageBoolean)
    17  	t.Run("INT32", testPageInt32)
    18  	t.Run("INT64", testPageInt64)
    19  	t.Run("INT96", testPageInt96)
    20  	t.Run("FLOAT", testPageFloat)
    21  	t.Run("DOUBLE", testPageDouble)
    22  	t.Run("BYTE_ARRAY", testPageByteArray)
    23  	t.Run("FIXED_LEN_BYTE_ARRAY", testPageFixedLenByteArray)
    24  }
    25  
    26  func testPageBoolean(t *testing.T) {
    27  	schema := parquet.SchemaOf(struct{ Value bool }{})
    28  
    29  	t.Run("parquet", func(t *testing.T) {
    30  		testPage(t, schema, pageTest{
    31  			write: func(w parquet.ValueWriter) (interface{}, error) {
    32  				values := make([]bool, 50_000)
    33  				for i := range values {
    34  					values[i] = i%2 == 0
    35  				}
    36  				n, err := w.(parquet.BooleanWriter).WriteBooleans(values)
    37  				return values[:n], err
    38  			},
    39  
    40  			read: func(r parquet.ValueReader) (interface{}, error) {
    41  				values := make([]bool, 50_000)
    42  				n, err := r.(parquet.BooleanReader).ReadBooleans(values)
    43  				return values[:n], err
    44  			},
    45  		})
    46  	})
    47  }
    48  
    49  func testPageInt32(t *testing.T) {
    50  	schema := parquet.SchemaOf(struct{ Value int32 }{})
    51  
    52  	t.Run("io", func(t *testing.T) {
    53  		testBufferPage(t, schema, pageTest{
    54  			write: func(w parquet.ValueWriter) (interface{}, error) {
    55  				values := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    56  				n, err := w.(io.Writer).Write(unsafecast.Int32ToBytes(values))
    57  				return values[:n/4], err
    58  			},
    59  
    60  			read: func(r parquet.ValueReader) (interface{}, error) {
    61  				values := make([]int32, 10)
    62  				n, err := r.(io.Reader).Read(unsafecast.Int32ToBytes(values))
    63  				return values[:n/4], err
    64  			},
    65  		})
    66  	})
    67  
    68  	t.Run("parquet", func(t *testing.T) {
    69  		testPage(t, schema, pageTest{
    70  			write: func(w parquet.ValueWriter) (interface{}, error) {
    71  				values := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    72  				n, err := w.(parquet.Int32Writer).WriteInt32s(values)
    73  				return values[:n], err
    74  			},
    75  
    76  			read: func(r parquet.ValueReader) (interface{}, error) {
    77  				values := make([]int32, 10)
    78  				n, err := r.(parquet.Int32Reader).ReadInt32s(values)
    79  				return values[:n], err
    80  			},
    81  		})
    82  	})
    83  }
    84  
    85  func testPageInt64(t *testing.T) {
    86  	schema := parquet.SchemaOf(struct{ Value int64 }{})
    87  
    88  	t.Run("io", func(t *testing.T) {
    89  		testBufferPage(t, schema, pageTest{
    90  			write: func(w parquet.ValueWriter) (interface{}, error) {
    91  				values := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    92  				n, err := w.(io.Writer).Write(unsafecast.Int64ToBytes(values))
    93  				return values[:n/8], err
    94  			},
    95  
    96  			read: func(r parquet.ValueReader) (interface{}, error) {
    97  				values := make([]int64, 10)
    98  				n, err := r.(io.Reader).Read(unsafecast.Int64ToBytes(values))
    99  				return values[:n/8], err
   100  			},
   101  		})
   102  	})
   103  
   104  	t.Run("parquet", func(t *testing.T) {
   105  		testPage(t, schema, pageTest{
   106  			write: func(w parquet.ValueWriter) (interface{}, error) {
   107  				values := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   108  				n, err := w.(parquet.Int64Writer).WriteInt64s(values)
   109  				return values[:n], err
   110  			},
   111  
   112  			read: func(r parquet.ValueReader) (interface{}, error) {
   113  				values := make([]int64, 10)
   114  				n, err := r.(parquet.Int64Reader).ReadInt64s(values)
   115  				return values[:n], err
   116  			},
   117  		})
   118  	})
   119  }
   120  
   121  func testPageInt96(t *testing.T) {
   122  	schema := parquet.SchemaOf(struct{ Value deprecated.Int96 }{})
   123  
   124  	t.Run("io", func(t *testing.T) {
   125  		testBufferPage(t, schema, pageTest{
   126  			write: func(w parquet.ValueWriter) (interface{}, error) {
   127  				values := []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}}
   128  				n, err := w.(io.Writer).Write(deprecated.Int96ToBytes(values))
   129  				return values[:n/12], err
   130  			},
   131  
   132  			read: func(r parquet.ValueReader) (interface{}, error) {
   133  				values := make([]deprecated.Int96, 3)
   134  				n, err := r.(io.Reader).Read(deprecated.Int96ToBytes(values))
   135  				return values[:n/12], err
   136  			},
   137  		})
   138  	})
   139  
   140  	t.Run("parquet", func(t *testing.T) {
   141  		testPage(t, schema, pageTest{
   142  			write: func(w parquet.ValueWriter) (interface{}, error) {
   143  				values := []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}}
   144  				n, err := w.(parquet.Int96Writer).WriteInt96s(values)
   145  				return values[:n], err
   146  			},
   147  
   148  			read: func(r parquet.ValueReader) (interface{}, error) {
   149  				values := make([]deprecated.Int96, 3)
   150  				n, err := r.(parquet.Int96Reader).ReadInt96s(values)
   151  				return values[:n], err
   152  			},
   153  		})
   154  	})
   155  }
   156  
   157  func testPageFloat(t *testing.T) {
   158  	schema := parquet.SchemaOf(struct{ Value float32 }{})
   159  
   160  	t.Run("io", func(t *testing.T) {
   161  		testBufferPage(t, schema, pageTest{
   162  			write: func(w parquet.ValueWriter) (interface{}, error) {
   163  				values := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   164  				n, err := w.(io.Writer).Write(unsafecast.Float32ToBytes(values))
   165  				return values[:n/4], err
   166  			},
   167  
   168  			read: func(r parquet.ValueReader) (interface{}, error) {
   169  				values := make([]float32, 10)
   170  				n, err := r.(io.Reader).Read(unsafecast.Float32ToBytes(values))
   171  				return values[:n/4], err
   172  			},
   173  		})
   174  	})
   175  
   176  	t.Run("parquet", func(t *testing.T) {
   177  		testPage(t, schema, pageTest{
   178  			write: func(w parquet.ValueWriter) (interface{}, error) {
   179  				values := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   180  				n, err := w.(parquet.FloatWriter).WriteFloats(values)
   181  				return values[:n], err
   182  			},
   183  
   184  			read: func(r parquet.ValueReader) (interface{}, error) {
   185  				values := make([]float32, 10)
   186  				n, err := r.(parquet.FloatReader).ReadFloats(values)
   187  				return values[:n], err
   188  			},
   189  		})
   190  	})
   191  }
   192  
   193  func testPageDouble(t *testing.T) {
   194  	schema := parquet.SchemaOf(struct{ Value float64 }{})
   195  
   196  	t.Run("io", func(t *testing.T) {
   197  		testBufferPage(t, schema, pageTest{
   198  			write: func(w parquet.ValueWriter) (interface{}, error) {
   199  				values := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   200  				n, err := w.(io.Writer).Write(unsafecast.Float64ToBytes(values))
   201  				return values[:n/8], err
   202  			},
   203  
   204  			read: func(r parquet.ValueReader) (interface{}, error) {
   205  				values := make([]float64, 10)
   206  				n, err := r.(io.Reader).Read(unsafecast.Float64ToBytes(values))
   207  				return values[:n/8], err
   208  			},
   209  		})
   210  	})
   211  
   212  	t.Run("parquet", func(t *testing.T) {
   213  		testPage(t, schema, pageTest{
   214  			write: func(w parquet.ValueWriter) (interface{}, error) {
   215  				values := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   216  				n, err := w.(parquet.DoubleWriter).WriteDoubles(values)
   217  				return values[:n], err
   218  			},
   219  
   220  			read: func(r parquet.ValueReader) (interface{}, error) {
   221  				values := make([]float64, 10)
   222  				n, err := r.(parquet.DoubleReader).ReadDoubles(values)
   223  				return values[:n], err
   224  			},
   225  		})
   226  	})
   227  }
   228  
   229  func testPageByteArray(t *testing.T) {
   230  	schema := parquet.SchemaOf(struct{ Value []byte }{})
   231  
   232  	t.Run("io", func(t *testing.T) {
   233  		testBufferPage(t, schema, pageTest{
   234  			write: func(w parquet.ValueWriter) (interface{}, error) {
   235  				values := []byte{}
   236  				values = plain.AppendByteArray(values, []byte("A"))
   237  				values = plain.AppendByteArray(values, []byte("B"))
   238  				values = plain.AppendByteArray(values, []byte("C"))
   239  				n, err := w.(io.Writer).Write(values)
   240  				return values[:n], err
   241  			},
   242  
   243  			read: func(r parquet.ValueReader) (interface{}, error) {
   244  				values := make([]byte, 3+3*plain.ByteArrayLengthSize)
   245  				n, err := r.(io.Reader).Read(values)
   246  				return values[:n], err
   247  			},
   248  		})
   249  	})
   250  
   251  	t.Run("parquet", func(t *testing.T) {
   252  		testPage(t, schema, pageTest{
   253  			write: func(w parquet.ValueWriter) (interface{}, error) {
   254  				values := []byte{}
   255  				values = plain.AppendByteArray(values, []byte("A"))
   256  				values = plain.AppendByteArray(values, []byte("B"))
   257  				values = plain.AppendByteArray(values, []byte("C"))
   258  				_, err := w.(parquet.ByteArrayWriter).WriteByteArrays(values)
   259  				return values, err
   260  			},
   261  
   262  			read: func(r parquet.ValueReader) (interface{}, error) {
   263  				values := make([]byte, 3+3*plain.ByteArrayLengthSize)
   264  				n, err := r.(parquet.ByteArrayReader).ReadByteArrays(values)
   265  				return values[:n+n*plain.ByteArrayLengthSize], err
   266  			},
   267  		})
   268  	})
   269  }
   270  
   271  func testPageFixedLenByteArray(t *testing.T) {
   272  	schema := parquet.SchemaOf(struct{ Value [3]byte }{})
   273  
   274  	t.Run("io", func(t *testing.T) {
   275  		testBufferPage(t, schema, pageTest{
   276  			write: func(w parquet.ValueWriter) (interface{}, error) {
   277  				values := []byte("123456789")
   278  				n, err := w.(io.Writer).Write(values)
   279  				return values[:n], err
   280  			},
   281  
   282  			read: func(r parquet.ValueReader) (interface{}, error) {
   283  				values := make([]byte, 3*3)
   284  				n, err := r.(io.Reader).Read(values)
   285  				return values[:n], err
   286  			},
   287  		})
   288  	})
   289  
   290  	t.Run("parquet", func(t *testing.T) {
   291  		testPage(t, schema, pageTest{
   292  			write: func(w parquet.ValueWriter) (interface{}, error) {
   293  				values := []byte("123456789")
   294  				n, err := w.(parquet.FixedLenByteArrayWriter).WriteFixedLenByteArrays(values)
   295  				return values[:3*n], err
   296  			},
   297  
   298  			read: func(r parquet.ValueReader) (interface{}, error) {
   299  				values := make([]byte, 3*3)
   300  				n, err := r.(parquet.FixedLenByteArrayReader).ReadFixedLenByteArrays(values)
   301  				return values[:3*n], err
   302  			},
   303  		})
   304  	})
   305  }
   306  
   307  type pageTest struct {
   308  	write func(parquet.ValueWriter) (interface{}, error)
   309  	read  func(parquet.ValueReader) (interface{}, error)
   310  }
   311  
   312  func testPage(t *testing.T, schema *parquet.Schema, test pageTest) {
   313  	t.Run("buffer", func(t *testing.T) { testBufferPage(t, schema, test) })
   314  	t.Run("file", func(t *testing.T) { testFilePage(t, schema, test) })
   315  }
   316  
   317  func testBufferPage(t *testing.T, schema *parquet.Schema, test pageTest) {
   318  	buffer := parquet.NewBuffer(schema)
   319  	column := buffer.ColumnBuffers()[0]
   320  
   321  	w, err := test.write(column)
   322  	if err != nil {
   323  		t.Fatal("writing page values:", err)
   324  	}
   325  
   326  	r, err := test.read(column.Page().Values())
   327  	if err != io.EOF {
   328  		t.Errorf("expected io.EOF after reading all values but got %v", err)
   329  	}
   330  	if !reflect.DeepEqual(w, r) {
   331  		t.Errorf("wrong values read from the page: got=%+v want=%+v", r, w)
   332  	}
   333  }
   334  
   335  func testFilePage(t *testing.T, schema *parquet.Schema, test pageTest) {
   336  	buffer := parquet.NewBuffer(schema)
   337  	column := buffer.ColumnBuffers()[0]
   338  
   339  	w, err := test.write(column)
   340  	if err != nil {
   341  		t.Fatal("writing page values:", err)
   342  	}
   343  
   344  	output := new(bytes.Buffer)
   345  	writer := parquet.NewWriter(output)
   346  	n, err := writer.WriteRowGroup(buffer)
   347  	if err != nil {
   348  		t.Fatal("writing parquet file:", err)
   349  	}
   350  	if err := writer.Close(); err != nil {
   351  		t.Fatal("writing parquet file:", err)
   352  	}
   353  	if n != buffer.NumRows() {
   354  		t.Fatalf("number of rows written mismatch: got=%d want=%d", n, buffer.NumRows())
   355  	}
   356  
   357  	reader := bytes.NewReader(output.Bytes())
   358  	f, err := parquet.OpenFile(reader, reader.Size())
   359  	if err != nil {
   360  		t.Fatal("opening parquet file:", err)
   361  	}
   362  
   363  	pages := f.RowGroups()[0].ColumnChunks()[0].Pages()
   364  	defer pages.Close()
   365  
   366  	p, err := pages.ReadPage()
   367  	if err != nil {
   368  		t.Fatal("reading parquet page:", err)
   369  	}
   370  	defer parquet.Release(p)
   371  
   372  	values := p.Values()
   373  	r, err := test.read(values)
   374  	if err != io.EOF && err != nil {
   375  		t.Errorf("expected io.EOF after reading all values but got %v", err)
   376  	}
   377  	if !reflect.DeepEqual(w, r) {
   378  		t.Errorf("wrong values read from the page: got=%+v want=%+v", r, w)
   379  	}
   380  	if r, err := test.read(values); reflect.ValueOf(r).Len() != 0 || err != io.EOF {
   381  		t.Errorf("expected no data and io.EOF after reading all values but got %d and %v", r, err)
   382  	}
   383  }
   384  
   385  type testStruct struct {
   386  	Value *string
   387  }
   388  
   389  func TestOptionalPageTrailingNulls(t *testing.T) {
   390  	schema := parquet.SchemaOf(&testStruct{})
   391  	buffer := parquet.NewBuffer(schema)
   392  
   393  	str := "test"
   394  	rows := []testStruct{{
   395  		Value: nil,
   396  	}, {
   397  		Value: &str,
   398  	}, {
   399  		Value: nil,
   400  	}}
   401  
   402  	for _, row := range rows {
   403  		_, err := buffer.WriteRows([]parquet.Row{schema.Deconstruct(nil, row)})
   404  		if err != nil {
   405  			t.Fatal("writing row:", err)
   406  		}
   407  	}
   408  
   409  	resultRows := make([]parquet.Row, 0, len(rows))
   410  	bufferRows := make([]parquet.Row, 10)
   411  	reader := buffer.Rows()
   412  	defer reader.Close()
   413  	for {
   414  		n, err := reader.ReadRows(bufferRows)
   415  		resultRows = append(resultRows, bufferRows[:n]...)
   416  		if err != nil {
   417  			if err == io.EOF {
   418  				break
   419  			}
   420  			t.Fatal("reading rows:", err)
   421  		}
   422  	}
   423  
   424  	if len(resultRows) != len(rows) {
   425  		t.Errorf("wrong number of rows read: got=%d want=%d", len(resultRows), len(rows))
   426  	}
   427  }
   428  
   429  func TestOptionalPagePreserveIndex(t *testing.T) {
   430  	schema := parquet.SchemaOf(&testStruct{})
   431  	buffer := parquet.NewBuffer(schema)
   432  
   433  	_, err := buffer.WriteRows([]parquet.Row{
   434  		schema.Deconstruct(nil, &testStruct{Value: nil}),
   435  	})
   436  	if err != nil {
   437  		t.Fatal("writing row:", err)
   438  	}
   439  
   440  	rows := buffer.Rows()
   441  	defer rows.Close()
   442  
   443  	rowbuf := make([]parquet.Row, 2)
   444  
   445  	n, err := rows.ReadRows(rowbuf)
   446  	if err != nil && err != io.EOF {
   447  		t.Fatal("reading rows:", err)
   448  	}
   449  	if n != 1 {
   450  		t.Fatal("wrong number of rows returned:", n)
   451  	}
   452  	if rowbuf[0][0].Column() != 0 {
   453  		t.Errorf("wrong index: got=%d want=%d", rowbuf[0][0].Column(), 0)
   454  	}
   455  
   456  	n, err = rows.ReadRows(rowbuf)
   457  	if err != io.EOF {
   458  		t.Fatal("reading EOF:", err)
   459  	}
   460  	if n != 0 {
   461  		t.Fatal("expected no more rows after EOF:", n)
   462  	}
   463  }
   464  
   465  func TestRepeatedPageTrailingNulls(t *testing.T) {
   466  	type testStruct struct {
   467  		A []string `parquet:"a"`
   468  	}
   469  
   470  	s := parquet.SchemaOf(&testStruct{})
   471  
   472  	records := []*testStruct{
   473  		{A: nil},
   474  		{A: []string{"test"}},
   475  		{A: nil},
   476  	}
   477  
   478  	buf := parquet.NewBuffer(s)
   479  	for _, rec := range records {
   480  		row := s.Deconstruct(nil, rec)
   481  		_, err := buf.WriteRows([]parquet.Row{row})
   482  		if err != nil {
   483  			t.Fatal(err)
   484  		}
   485  	}
   486  
   487  	rows := make([]parquet.Row, len(records)+1)
   488  	reader := buf.Rows()
   489  	defer reader.Close()
   490  
   491  	n, err := reader.ReadRows(rows)
   492  	if err != nil && err != io.EOF {
   493  		t.Fatal("reading rows:", err)
   494  	}
   495  
   496  	if n != len(records) {
   497  		t.Errorf("wrong number of rows read: got=%d want=%d", n, len(records))
   498  	}
   499  }
   500  
   501  func TestReslicingBooleanPage(t *testing.T) {
   502  	type testStruct struct {
   503  		B bool `parquet:"b"`
   504  	}
   505  
   506  	numValues := 100
   507  	expected := []*testStruct{}
   508  	for i := 0; i < numValues; i++ {
   509  		expected = append(expected, &testStruct{B: i%2 == 0})
   510  	}
   511  
   512  	buf := new(bytes.Buffer)
   513  	writer := parquet.NewGenericWriter[*testStruct](buf)
   514  	_, err := writer.Write(expected)
   515  	if err != nil {
   516  		t.Fatal(err)
   517  	}
   518  	err = writer.Close()
   519  	if err != nil {
   520  		t.Fatal(err)
   521  	}
   522  
   523  	reader := bytes.NewReader(buf.Bytes())
   524  	pf, err := parquet.OpenFile(reader, reader.Size())
   525  	if err != nil {
   526  		t.Fatal(err)
   527  	}
   528  
   529  	// grab the page we wrote above
   530  	rg := pf.RowGroups()[0]
   531  	cc := rg.ColumnChunks()
   532  	pgs := cc[0].Pages()
   533  
   534  	pg, err := pgs.ReadPage()
   535  	if err != nil {
   536  		t.Fatal(err)
   537  	}
   538  
   539  	// continue reslicing and reading the values
   540  	sliceEvery := 3
   541  	for i := 0; i < numValues-1; i += sliceEvery {
   542  		vs := make([]parquet.Value, numValues)
   543  
   544  		low := int64(sliceEvery)
   545  		high := int64(numValues - i)
   546  
   547  		if low >= high {
   548  			break
   549  		}
   550  
   551  		// slice the page
   552  		pg = pg.Slice(low, high)
   553  		v := pg.Values()
   554  		v.ReadValues(vs)
   555  
   556  		// and the expected values with the same low/high
   557  		expected = expected[low:high]
   558  
   559  		// confirm values match
   560  		for n, exp := range expected {
   561  			if exp.B != vs[n].Boolean() {
   562  				t.Fatalf("unexpected value: %v at pos: %d", vs[n], n)
   563  			}
   564  			n++
   565  		}
   566  	}
   567  }