github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/colexec/external/parquet_test.go

// Copyright 2024 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package external

import (
	"bytes"
	"fmt"
	"strings"
	"testing"

	"github.com/matrixorigin/matrixone/pkg/container/types"
	"github.com/matrixorigin/matrixone/pkg/sql/plan"
	"github.com/matrixorigin/matrixone/pkg/testutil"
	"github.com/parquet-go/parquet-go"
	"github.com/parquet-go/parquet-go/encoding"
	"github.com/stretchr/testify/require"
)

func Test_getMapper(t *testing.T) {
	proc := testutil.NewProc()

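	// Build an optional, RLE-dictionary-encoded, gzip-compressed string column,
	// write a single row group, and check that the mapper copies both the
	// dictionary-indexed values and the nulls into a varchar vector.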
	t.Run("indexed string", func(t *testing.T) {
		var buf bytes.Buffer
		schema := parquet.NewSchema("x", parquet.Group{
			// TODO: check why parquet.PlainDictionary does not work
			"c": parquet.Compressed(parquet.Optional(parquet.Encoded(parquet.String(), &parquet.RLEDictionary)), &parquet.Gzip),
		})
		w := parquet.NewWriter(&buf, schema)

		long1 := strings.Repeat("xyzABC", 10)
		long2 := strings.Repeat("789$&@", 10)
		values := []parquet.Value{
			parquet.ValueOf(nil),
			parquet.ValueOf("aa"),
			parquet.ValueOf(nil),
			parquet.ValueOf("bb"),
			parquet.ValueOf("aa"),
			parquet.ValueOf(long2),
			parquet.ValueOf(long2),
			parquet.ValueOf("aa"),
			parquet.ValueOf("bb"),
			parquet.ValueOf(long1),
			parquet.ValueOf(nil),
			parquet.ValueOf(nil),
			parquet.ValueOf(long1),
		}
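		// The column is optional: nulls get definition level 0 and present
		// values definition level 1 so the writer accepts the row.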
		for i := range values {
			v := &values[i]
			if v.IsNull() {
				values[i] = v.Level(0, 0, 0)
			} else {
				values[i] = v.Level(0, 1, 0)
			}
		}
		_, err := w.WriteRows([]parquet.Row{parquet.MakeRow(values)})
		require.NoError(t, err)

		err = w.Close()
		require.NoError(t, err)

		f, err := parquet.OpenFile(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
		require.NoError(t, err)

		col := f.Root().Column("c")
		page, err := col.Pages().ReadPage()
		require.NoError(t, err)

		vec := proc.GetVector(types.New(types.T_varchar, 0, 0))
		var h ParquetHandler
		err = h.getMapper(col, plan.Type{
			Id: int32(types.T_varchar),
		}).mapping(page, proc, vec)
		require.NoError(t, err)

		require.Equal(t, len(values), vec.Length())
		for i, v := range values {
			if v.IsNull() {
				require.True(t, vec.IsNull(uint64(i)))
			} else {
				require.Equal(t, v.String(), vec.GetStringAt(i))
			}
		}
	})

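	// Each case writes a one-column parquet file of parquet type st and maps the
	// resulting page into a MatrixOne vector of type dt. expected is the vector's
	// String() for the NOT NULL run; expectedOpt is for the nullable run, where a
	// null row is added before and after the values (hence nulls at rows 0 and 3).
	// When an expectation is empty, fmt.Sprint(values) is compared instead.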
	tests := []struct {
		st          parquet.Type
		numValues   int
		values      encoding.Values
		dt          types.T
		expected    string
		expectedOpt string
	}{
		{
			st:          parquet.BooleanType,
			numValues:   2,
			values:      encoding.BooleanValues([]byte{2}),
			dt:          types.T_bool,
			expectedOpt: "[false false true false]-[0 3]",
		},
		{
			st:          parquet.Int32Type,
			numValues:   2,
			values:      encoding.Int32Values([]int32{1, 5}),
			dt:          types.T_int32,
			expectedOpt: "[0 1 5 0]-[0 3]",
		},
		{
			st:          parquet.Int64Type,
			numValues:   2,
			values:      encoding.Int64Values([]int64{2, 7}),
			dt:          types.T_int64,
			expectedOpt: "[0 2 7 0]-[0 3]",
		},
		{
			st:          parquet.Uint(32).Type(),
			numValues:   2,
			values:      encoding.Uint32Values([]uint32{5, 3}),
			dt:          types.T_uint32,
			expectedOpt: "[0 5 3 0]-[0 3]",
		},
		{
			st:          parquet.Uint(64).Type(),
			numValues:   2,
			values:      encoding.Uint64Values([]uint64{8, 10}),
			dt:          types.T_uint64,
			expectedOpt: "[0 8 10 0]-[0 3]",
		},
		{
			st:          parquet.Int64Type,
			numValues:   2,
			values:      encoding.Int64Values([]int64{2, 7}),
			dt:          types.T_int64,
			expectedOpt: "[0 2 7 0]-[0 3]",
		},
		// {
		// 	st: parquet.Int96Type,
		// },
		{
			st:          parquet.FloatType,
			numValues:   2,
			values:      encoding.FloatValues([]float32{7.5, 3.2}),
			dt:          types.T_float32,
			expectedOpt: "[0 7.5 3.2 0]-[0 3]",
		},
		{
			st:          parquet.DoubleType,
			numValues:   2,
			values:      encoding.DoubleValues([]float64{77.9, 0}),
			dt:          types.T_float64,
			expectedOpt: "[0 77.9 0 0]-[0 3]",
		},
		{
			st:          parquet.String().Type(),
			numValues:   2,
			values:      encoding.ByteArrayValues([]byte("abcdefg"), []uint32{0, 3, 7}),
			dt:          types.T_varchar,
			expectedOpt: "[ abc defg ]-[0 3]",
		},
		{
			st:          parquet.FixedLenByteArrayType(3),
			numValues:   2,
			values:      encoding.FixedLenByteArrayValues([]byte("abcdef"), 3),
			dt:          types.T_char,
			expectedOpt: "[ abc def ]-[0 3]",
		},
		{
			st:          parquet.Date().Type(),
			numValues:   2,
			values:      encoding.Int32Values([]int32{357, 1245}),
			dt:          types.T_date,
			expected:    "[0001-12-24 0004-05-30]",
			expectedOpt: "[0001-01-01 0001-12-24 0004-05-30 0001-01-01]-[0 3]",
		},
		{
			st:          parquet.Time(parquet.Nanosecond).Type(),
			numValues:   2,
			values:      encoding.Int64Values([]int64{18783_111111_111, 25783_222222_222}),
			dt:          types.T_time,
			expected:    "[05:13:03 07:09:43]",
			expectedOpt: "[00:00:00 05:13:03 07:09:43 00:00:00]-[0 3]",
		},
		{
			st:          parquet.Time(parquet.Microsecond).Type(),
			numValues:   2,
			values:      encoding.Int64Values([]int64{18783_111111, 25783_222222}),
			dt:          types.T_time,
			expected:    "[05:13:03 07:09:43]",
			expectedOpt: "[00:00:00 05:13:03 07:09:43 00:00:00]-[0 3]",
		},
		{
			st:          parquet.Time(parquet.Millisecond).Type(),
			numValues:   2,
			values:      encoding.Int32Values([]int32{18783_111, 25783_222}),
			dt:          types.T_time,
			expected:    "[05:13:03 07:09:43]",
			expectedOpt: "[00:00:00 05:13:03 07:09:43 00:00:00]-[0 3]",
		},
		{
			st:          parquet.Timestamp(parquet.Nanosecond).Type(),
			numValues:   2,
			values:      encoding.Int64Values([]int64{1713419514_111111_111, 1713429514_222222_222}),
			dt:          types.T_timestamp,
			expected:    "[2024-04-18 05:51:54.111111 UTC 2024-04-18 08:38:34.222222 UTC]",
			expectedOpt: "[0001-01-01 00:00:00.000000 UTC 2024-04-18 05:51:54.111111 UTC 2024-04-18 08:38:34.222222 UTC 0001-01-01 00:00:00.000000 UTC]-[0 3]",
		},
		{
			st:          parquet.Timestamp(parquet.Microsecond).Type(),
			numValues:   2,
			values:      encoding.Int64Values([]int64{1713419514_111111, 1713429514_222222}),
			dt:          types.T_timestamp,
			expected:    "[2024-04-18 05:51:54.111111 UTC 2024-04-18 08:38:34.222222 UTC]",
			expectedOpt: "[0001-01-01 00:00:00.000000 UTC 2024-04-18 05:51:54.111111 UTC 2024-04-18 08:38:34.222222 UTC 0001-01-01 00:00:00.000000 UTC]-[0 3]",
		},
		{
			st:          parquet.Timestamp(parquet.Millisecond).Type(),
			numValues:   2,
			values:      encoding.Int64Values([]int64{1713419514_111, 1713429514_222}),
			dt:          types.T_timestamp,
			expected:    "[2024-04-18 05:51:54.111000 UTC 2024-04-18 08:38:34.222000 UTC]",
			expectedOpt: "[0001-01-01 00:00:00.000000 UTC 2024-04-18 05:51:54.111000 UTC 2024-04-18 08:38:34.222000 UTC 0001-01-01 00:00:00.000000 UTC]-[0 3]",
		},
	}
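	// Not-null pass: write the page values as a required column and map the
	// page into a vector declared NotNullable.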
	for _, tc := range tests {
		t.Run(fmt.Sprintf("%s to %s not null", tc.st, tc.dt), func(t *testing.T) {
			page := tc.st.NewPage(0, tc.numValues, tc.values)

			var buf bytes.Buffer
			schema := parquet.NewSchema("x", parquet.Group{
				"c": parquet.Leaf(tc.st),
			})
			w := parquet.NewWriter(&buf, schema)

			values := make([]parquet.Value, page.NumRows())
			page.Values().ReadValues(values)
			_, err := w.WriteRows([]parquet.Row{parquet.MakeRow(values)})
			require.NoError(t, err)
			err = w.Close()
			require.NoError(t, err)

			f, err := parquet.OpenFile(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
			require.NoError(t, err)

			vec := proc.GetVector(types.New(tc.dt, 0, 0))
			var h ParquetHandler
			err = h.getMapper(f.Root().Column("c"), plan.Type{
				Id:          int32(tc.dt),
				NotNullable: true,
			}).mapping(page, proc, vec)
			require.NoError(t, err)
			if tc.expected != "" {
				require.Equal(t, tc.expected, vec.String())
			} else {
				require.Equal(t, fmt.Sprint(values), vec.String())
			}
		})
	}

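	// Nullable pass: a null row is written before and after the test values,
	// and each non-null value is raised to definition level 1 so it round-trips
	// through the optional column.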
	for _, tc := range tests {
		t.Run(fmt.Sprintf("%s to %s null", tc.st, tc.dt), func(t *testing.T) {
			var buf bytes.Buffer
			schema := parquet.NewSchema("x", parquet.Group{
				"c": parquet.Optional(parquet.Leaf(tc.st)),
			})
			w := parquet.NewWriter(&buf, schema)

			err := w.Write(nil)
			require.NoError(t, err)

			page := tc.st.NewPage(0, tc.numValues, tc.values)
			values := make([]parquet.Value, page.NumRows())
			page.Values().ReadValues(values)
			for i := range values {
				v := &values[i]
				*v = v.Level(v.RepetitionLevel(), 1, v.Column())
			}

			_, err = w.WriteRows([]parquet.Row{parquet.MakeRow(values)})
			require.NoError(t, err)

			err = w.Write(nil)
			require.NoError(t, err)

			err = w.Close()
			require.NoError(t, err)

			f, err := parquet.OpenFile(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
			require.NoError(t, err)

			vec := proc.GetVector(types.New(tc.dt, 0, 0))
			var h ParquetHandler
			mp := h.getMapper(f.Root().Column("c"), plan.Type{
				Id: int32(tc.dt),
			})

			pages := f.Root().Column("c").Pages()
			page, _ = pages.ReadPage()
			err = mp.mapping(page, proc, vec)
			require.NoError(t, err)
			if tc.expectedOpt != "" {
				require.Equal(t, tc.expectedOpt, vec.String())
			} else {
				require.Equal(t, fmt.Sprint(values), vec.String())
			}
		})
	}
}