github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/page_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"io"
     6  	"reflect"
     7  	"testing"
     8  
     9  	"github.com/vc42/parquet-go"
    10  	"github.com/vc42/parquet-go/deprecated"
    11  	"github.com/vc42/parquet-go/encoding/plain"
    12  	"github.com/vc42/parquet-go/internal/unsafecast"
    13  )
    14  
    15  func TestPage(t *testing.T) {
    16  	t.Run("BOOLEAN", testPageBoolean)
    17  	t.Run("INT32", testPageInt32)
    18  	t.Run("INT64", testPageInt64)
    19  	t.Run("INT96", testPageInt96)
    20  	t.Run("FLOAT", testPageFloat)
    21  	t.Run("DOUBLE", testPageDouble)
    22  	t.Run("BYTE_ARRAY", testPageByteArray)
    23  	t.Run("FIXED_LEN_BYTE_ARRAY", testPageFixedLenByteArray)
    24  }
    25  
    26  func testPageBoolean(t *testing.T) {
    27  	schema := parquet.SchemaOf(struct{ Value bool }{})
    28  
    29  	t.Run("parquet", func(t *testing.T) {
    30  		testPage(t, schema, pageTest{
    31  			write: func(w parquet.ValueWriter) (interface{}, error) {
    32  				values := []bool{false, true}
    33  				n, err := w.(parquet.BooleanWriter).WriteBooleans(values)
    34  				return values[:n], err
    35  			},
    36  
    37  			read: func(r parquet.ValueReader) (interface{}, error) {
    38  				values := make([]bool, 2)
    39  				n, err := r.(parquet.BooleanReader).ReadBooleans(values)
    40  				return values[:n], err
    41  			},
    42  		})
    43  	})
    44  }
    45  
    46  func testPageInt32(t *testing.T) {
    47  	schema := parquet.SchemaOf(struct{ Value int32 }{})
    48  
    49  	t.Run("io", func(t *testing.T) {
    50  		testBufferPage(t, schema, pageTest{
    51  			write: func(w parquet.ValueWriter) (interface{}, error) {
    52  				values := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    53  				n, err := w.(io.Writer).Write(unsafecast.Int32ToBytes(values))
    54  				return values[:n/4], err
    55  			},
    56  
    57  			read: func(r parquet.ValueReader) (interface{}, error) {
    58  				values := make([]int32, 10)
    59  				n, err := r.(io.Reader).Read(unsafecast.Int32ToBytes(values))
    60  				return values[:n/4], err
    61  			},
    62  		})
    63  	})
    64  
    65  	t.Run("parquet", func(t *testing.T) {
    66  		testPage(t, schema, pageTest{
    67  			write: func(w parquet.ValueWriter) (interface{}, error) {
    68  				values := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    69  				n, err := w.(parquet.Int32Writer).WriteInt32s(values)
    70  				return values[:n], err
    71  			},
    72  
    73  			read: func(r parquet.ValueReader) (interface{}, error) {
    74  				values := make([]int32, 10)
    75  				n, err := r.(parquet.Int32Reader).ReadInt32s(values)
    76  				return values[:n], err
    77  			},
    78  		})
    79  	})
    80  }
    81  
    82  func testPageInt64(t *testing.T) {
    83  	schema := parquet.SchemaOf(struct{ Value int64 }{})
    84  
    85  	t.Run("io", func(t *testing.T) {
    86  		testBufferPage(t, schema, pageTest{
    87  			write: func(w parquet.ValueWriter) (interface{}, error) {
    88  				values := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    89  				n, err := w.(io.Writer).Write(unsafecast.Int64ToBytes(values))
    90  				return values[:n/8], err
    91  			},
    92  
    93  			read: func(r parquet.ValueReader) (interface{}, error) {
    94  				values := make([]int64, 10)
    95  				n, err := r.(io.Reader).Read(unsafecast.Int64ToBytes(values))
    96  				return values[:n/8], err
    97  			},
    98  		})
    99  	})
   100  
   101  	t.Run("parquet", func(t *testing.T) {
   102  		testPage(t, schema, pageTest{
   103  			write: func(w parquet.ValueWriter) (interface{}, error) {
   104  				values := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   105  				n, err := w.(parquet.Int64Writer).WriteInt64s(values)
   106  				return values[:n], err
   107  			},
   108  
   109  			read: func(r parquet.ValueReader) (interface{}, error) {
   110  				values := make([]int64, 10)
   111  				n, err := r.(parquet.Int64Reader).ReadInt64s(values)
   112  				return values[:n], err
   113  			},
   114  		})
   115  	})
   116  }
   117  
   118  func testPageInt96(t *testing.T) {
   119  	schema := parquet.SchemaOf(struct{ Value deprecated.Int96 }{})
   120  
   121  	t.Run("io", func(t *testing.T) {
   122  		testBufferPage(t, schema, pageTest{
   123  			write: func(w parquet.ValueWriter) (interface{}, error) {
   124  				values := []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}}
   125  				n, err := w.(io.Writer).Write(deprecated.Int96ToBytes(values))
   126  				return values[:n/12], err
   127  			},
   128  
   129  			read: func(r parquet.ValueReader) (interface{}, error) {
   130  				values := make([]deprecated.Int96, 3)
   131  				n, err := r.(io.Reader).Read(deprecated.Int96ToBytes(values))
   132  				return values[:n/12], err
   133  			},
   134  		})
   135  	})
   136  
   137  	t.Run("parquet", func(t *testing.T) {
   138  		testPage(t, schema, pageTest{
   139  			write: func(w parquet.ValueWriter) (interface{}, error) {
   140  				values := []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}}
   141  				n, err := w.(parquet.Int96Writer).WriteInt96s(values)
   142  				return values[:n], err
   143  			},
   144  
   145  			read: func(r parquet.ValueReader) (interface{}, error) {
   146  				values := make([]deprecated.Int96, 3)
   147  				n, err := r.(parquet.Int96Reader).ReadInt96s(values)
   148  				return values[:n], err
   149  			},
   150  		})
   151  	})
   152  }
   153  
   154  func testPageFloat(t *testing.T) {
   155  	schema := parquet.SchemaOf(struct{ Value float32 }{})
   156  
   157  	t.Run("io", func(t *testing.T) {
   158  		testBufferPage(t, schema, pageTest{
   159  			write: func(w parquet.ValueWriter) (interface{}, error) {
   160  				values := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   161  				n, err := w.(io.Writer).Write(unsafecast.Float32ToBytes(values))
   162  				return values[:n/4], err
   163  			},
   164  
   165  			read: func(r parquet.ValueReader) (interface{}, error) {
   166  				values := make([]float32, 10)
   167  				n, err := r.(io.Reader).Read(unsafecast.Float32ToBytes(values))
   168  				return values[:n/4], err
   169  			},
   170  		})
   171  	})
   172  
   173  	t.Run("parquet", func(t *testing.T) {
   174  		testPage(t, schema, pageTest{
   175  			write: func(w parquet.ValueWriter) (interface{}, error) {
   176  				values := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   177  				n, err := w.(parquet.FloatWriter).WriteFloats(values)
   178  				return values[:n], err
   179  			},
   180  
   181  			read: func(r parquet.ValueReader) (interface{}, error) {
   182  				values := make([]float32, 10)
   183  				n, err := r.(parquet.FloatReader).ReadFloats(values)
   184  				return values[:n], err
   185  			},
   186  		})
   187  	})
   188  }
   189  
   190  func testPageDouble(t *testing.T) {
   191  	schema := parquet.SchemaOf(struct{ Value float64 }{})
   192  
   193  	t.Run("io", func(t *testing.T) {
   194  		testBufferPage(t, schema, pageTest{
   195  			write: func(w parquet.ValueWriter) (interface{}, error) {
   196  				values := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   197  				n, err := w.(io.Writer).Write(unsafecast.Float64ToBytes(values))
   198  				return values[:n/8], err
   199  			},
   200  
   201  			read: func(r parquet.ValueReader) (interface{}, error) {
   202  				values := make([]float64, 10)
   203  				n, err := r.(io.Reader).Read(unsafecast.Float64ToBytes(values))
   204  				return values[:n/8], err
   205  			},
   206  		})
   207  	})
   208  
   209  	t.Run("parquet", func(t *testing.T) {
   210  		testPage(t, schema, pageTest{
   211  			write: func(w parquet.ValueWriter) (interface{}, error) {
   212  				values := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
   213  				n, err := w.(parquet.DoubleWriter).WriteDoubles(values)
   214  				return values[:n], err
   215  			},
   216  
   217  			read: func(r parquet.ValueReader) (interface{}, error) {
   218  				values := make([]float64, 10)
   219  				n, err := r.(parquet.DoubleReader).ReadDoubles(values)
   220  				return values[:n], err
   221  			},
   222  		})
   223  	})
   224  }
   225  
   226  func testPageByteArray(t *testing.T) {
   227  	schema := parquet.SchemaOf(struct{ Value []byte }{})
   228  
   229  	t.Run("io", func(t *testing.T) {
   230  		testBufferPage(t, schema, pageTest{
   231  			write: func(w parquet.ValueWriter) (interface{}, error) {
   232  				values := []byte{}
   233  				values = plain.AppendByteArray(values, []byte("A"))
   234  				values = plain.AppendByteArray(values, []byte("B"))
   235  				values = plain.AppendByteArray(values, []byte("C"))
   236  				n, err := w.(io.Writer).Write(values)
   237  				return values[:n], err
   238  			},
   239  
   240  			read: func(r parquet.ValueReader) (interface{}, error) {
   241  				values := make([]byte, 3+3*plain.ByteArrayLengthSize)
   242  				n, err := r.(io.Reader).Read(values)
   243  				return values[:n], err
   244  			},
   245  		})
   246  	})
   247  
   248  	t.Run("parquet", func(t *testing.T) {
   249  		testPage(t, schema, pageTest{
   250  			write: func(w parquet.ValueWriter) (interface{}, error) {
   251  				values := []byte{}
   252  				values = plain.AppendByteArray(values, []byte("A"))
   253  				values = plain.AppendByteArray(values, []byte("B"))
   254  				values = plain.AppendByteArray(values, []byte("C"))
   255  				_, err := w.(parquet.ByteArrayWriter).WriteByteArrays(values)
   256  				return values, err
   257  			},
   258  
   259  			read: func(r parquet.ValueReader) (interface{}, error) {
   260  				values := make([]byte, 3+3*plain.ByteArrayLengthSize)
   261  				n, err := r.(parquet.ByteArrayReader).ReadByteArrays(values)
   262  				return values[:n+n*plain.ByteArrayLengthSize], err
   263  			},
   264  		})
   265  	})
   266  }
   267  
   268  func testPageFixedLenByteArray(t *testing.T) {
   269  	schema := parquet.SchemaOf(struct{ Value [3]byte }{})
   270  
   271  	t.Run("io", func(t *testing.T) {
   272  		testBufferPage(t, schema, pageTest{
   273  			write: func(w parquet.ValueWriter) (interface{}, error) {
   274  				values := []byte("123456789")
   275  				n, err := w.(io.Writer).Write(values)
   276  				return values[:n], err
   277  			},
   278  
   279  			read: func(r parquet.ValueReader) (interface{}, error) {
   280  				values := make([]byte, 3*3)
   281  				n, err := r.(io.Reader).Read(values)
   282  				return values[:n], err
   283  			},
   284  		})
   285  	})
   286  
   287  	t.Run("parquet", func(t *testing.T) {
   288  		testPage(t, schema, pageTest{
   289  			write: func(w parquet.ValueWriter) (interface{}, error) {
   290  				values := []byte("123456789")
   291  				n, err := w.(parquet.FixedLenByteArrayWriter).WriteFixedLenByteArrays(values)
   292  				return values[:3*n], err
   293  			},
   294  
   295  			read: func(r parquet.ValueReader) (interface{}, error) {
   296  				values := make([]byte, 3*3)
   297  				n, err := r.(parquet.FixedLenByteArrayReader).ReadFixedLenByteArrays(values)
   298  				return values[:3*n], err
   299  			},
   300  		})
   301  	})
   302  }
   303  
   304  type pageTest struct {
   305  	write func(parquet.ValueWriter) (interface{}, error)
   306  	read  func(parquet.ValueReader) (interface{}, error)
   307  }
   308  
   309  func testPage(t *testing.T, schema *parquet.Schema, test pageTest) {
   310  	t.Run("buffer", func(t *testing.T) { testBufferPage(t, schema, test) })
   311  	t.Run("file", func(t *testing.T) { testFilePage(t, schema, test) })
   312  }
   313  
   314  func testBufferPage(t *testing.T, schema *parquet.Schema, test pageTest) {
   315  	buffer := parquet.NewBuffer(schema)
   316  	column := buffer.ColumnBuffers()[0]
   317  
   318  	w, err := test.write(column)
   319  	if err != nil {
   320  		t.Fatal("writing page values:", err)
   321  	}
   322  
   323  	r, err := test.read(column.Page().Values())
   324  	if err != io.EOF {
   325  		t.Errorf("expected io.EOF after reading all values but got %v", err)
   326  	}
   327  	if !reflect.DeepEqual(w, r) {
   328  		t.Errorf("wrong values read from the page: got=%+v want=%+v", r, w)
   329  	}
   330  }
   331  
   332  func testFilePage(t *testing.T, schema *parquet.Schema, test pageTest) {
   333  	buffer := parquet.NewBuffer(schema)
   334  	column := buffer.ColumnBuffers()[0]
   335  
   336  	w, err := test.write(column)
   337  	if err != nil {
   338  		t.Fatal("writing page values:", err)
   339  	}
   340  
   341  	output := new(bytes.Buffer)
   342  	writer := parquet.NewWriter(output)
   343  	n, err := writer.WriteRowGroup(buffer)
   344  	if err != nil {
   345  		t.Fatal("writing parquet file:", err)
   346  	}
   347  	if err := writer.Close(); err != nil {
   348  		t.Fatal("writing parquet file:", err)
   349  	}
   350  	if n != buffer.NumRows() {
   351  		t.Fatalf("number of rows written mismatch: got=%d want=%d", n, buffer.NumRows())
   352  	}
   353  
   354  	reader := bytes.NewReader(output.Bytes())
   355  	f, err := parquet.OpenFile(reader, reader.Size())
   356  	if err != nil {
   357  		t.Fatal("opening parquet file:", err)
   358  	}
   359  
   360  	pages := f.RowGroups()[0].ColumnChunks()[0].Pages()
   361  	defer pages.Close()
   362  
   363  	p, err := pages.ReadPage()
   364  	if err != nil {
   365  		t.Fatal("reading parquet page:", err)
   366  	}
   367  
   368  	values := p.Values()
   369  	r, err := test.read(values)
   370  	if err != io.EOF && err != nil {
   371  		t.Errorf("expected io.EOF after reading all values but got %v", err)
   372  	}
   373  	if !reflect.DeepEqual(w, r) {
   374  		t.Errorf("wrong values read from the page: got=%+v want=%+v", r, w)
   375  	}
   376  	if r, err := test.read(values); reflect.ValueOf(r).Len() != 0 || err != io.EOF {
   377  		t.Errorf("expected no data and io.EOF after reading all values but got %d and %v", r, err)
   378  	}
   379  }
   380  
   381  type testStruct struct {
   382  	Value *string
   383  }
   384  
   385  func TestOptionalPageTrailingNulls(t *testing.T) {
   386  	schema := parquet.SchemaOf(&testStruct{})
   387  	buffer := parquet.NewBuffer(schema)
   388  
   389  	str := "test"
   390  	rows := []testStruct{{
   391  		Value: nil,
   392  	}, {
   393  		Value: &str,
   394  	}, {
   395  		Value: nil,
   396  	}}
   397  
   398  	for _, row := range rows {
   399  		_, err := buffer.WriteRows([]parquet.Row{schema.Deconstruct(nil, row)})
   400  		if err != nil {
   401  			t.Fatal("writing row:", err)
   402  		}
   403  	}
   404  
   405  	resultRows := make([]parquet.Row, 0, len(rows))
   406  	bufferRows := make([]parquet.Row, 10)
   407  	reader := buffer.Rows()
   408  	defer reader.Close()
   409  	for {
   410  		n, err := reader.ReadRows(bufferRows)
   411  		resultRows = append(resultRows, bufferRows[:n]...)
   412  		if err != nil {
   413  			if err == io.EOF {
   414  				break
   415  			}
   416  			t.Fatal("reading rows:", err)
   417  		}
   418  	}
   419  
   420  	if len(resultRows) != len(rows) {
   421  		t.Errorf("wrong number of rows read: got=%d want=%d", len(resultRows), len(rows))
   422  	}
   423  }
   424  
   425  func TestOptionalPagePreserveIndex(t *testing.T) {
   426  	schema := parquet.SchemaOf(&testStruct{})
   427  	buffer := parquet.NewBuffer(schema)
   428  
   429  	_, err := buffer.WriteRows([]parquet.Row{
   430  		schema.Deconstruct(nil, &testStruct{Value: nil}),
   431  	})
   432  	if err != nil {
   433  		t.Fatal("writing row:", err)
   434  	}
   435  
   436  	rows := buffer.Rows()
   437  	defer rows.Close()
   438  
   439  	rowbuf := make([]parquet.Row, 2)
   440  	n, err := rows.ReadRows(rowbuf)
   441  	if err != io.EOF {
   442  		t.Fatal("reading rows:", err)
   443  	}
   444  	if n != 1 {
   445  		t.Fatal("wrong number of rows returned:", n)
   446  	}
   447  	if rowbuf[0][0].Column() != 0 {
   448  		t.Errorf("wrong index: got=%d want=%d", rowbuf[0][0].Column(), 0)
   449  	}
   450  }
   451  
   452  func TestRepeatedPageTrailingNulls(t *testing.T) {
   453  	type testStruct struct {
   454  		A []string `parquet:"a"`
   455  	}
   456  
   457  	s := parquet.SchemaOf(&testStruct{})
   458  
   459  	records := []*testStruct{
   460  		{A: nil},
   461  		{A: []string{"test"}},
   462  		{A: nil},
   463  	}
   464  
   465  	buf := parquet.NewBuffer(s)
   466  	for _, rec := range records {
   467  		row := s.Deconstruct(nil, rec)
   468  		_, err := buf.WriteRows([]parquet.Row{row})
   469  		if err != nil {
   470  			t.Fatal(err)
   471  		}
   472  	}
   473  
   474  	rows := make([]parquet.Row, len(records)+1)
   475  	reader := buf.Rows()
   476  	defer reader.Close()
   477  
   478  	n, err := reader.ReadRows(rows)
   479  	if err != io.EOF {
   480  		t.Fatal("reading rows:", err)
   481  	}
   482  
   483  	if n != len(records) {
   484  		t.Errorf("wrong number of rows read: got=%d want=%d", n, len(records))
   485  	}
   486  }