github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/colexec/external/parquet_test.go (about) 1 // Copyright 2024 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package external 16 17 import ( 18 "bytes" 19 "fmt" 20 "strings" 21 "testing" 22 23 "github.com/matrixorigin/matrixone/pkg/container/types" 24 "github.com/matrixorigin/matrixone/pkg/sql/plan" 25 "github.com/matrixorigin/matrixone/pkg/testutil" 26 "github.com/parquet-go/parquet-go" 27 "github.com/parquet-go/parquet-go/encoding" 28 "github.com/stretchr/testify/require" 29 ) 30 31 func Test_getMapper(t *testing.T) { 32 proc := testutil.NewProc() 33 34 t.Run("indexed string", func(t *testing.T) { 35 var buf bytes.Buffer 36 schema := parquet.NewSchema("x", parquet.Group{ 37 // TODO: check why parquet.PlainDictionary not work 38 "c": parquet.Compressed(parquet.Optional(parquet.Encoded(parquet.String(), &parquet.RLEDictionary)), &parquet.Gzip), 39 }) 40 w := parquet.NewWriter(&buf, schema) 41 42 long1 := strings.Repeat("xyzABC", 10) 43 long2 := strings.Repeat("789$&@", 10) 44 values := []parquet.Value{ 45 parquet.ValueOf(nil), 46 parquet.ValueOf("aa"), 47 parquet.ValueOf(nil), 48 parquet.ValueOf("bb"), 49 parquet.ValueOf("aa"), 50 parquet.ValueOf(long2), 51 parquet.ValueOf(long2), 52 parquet.ValueOf("aa"), 53 parquet.ValueOf("bb"), 54 parquet.ValueOf(long1), 55 parquet.ValueOf(nil), 56 parquet.ValueOf(nil), 57 parquet.ValueOf(long1), 58 } 59 for i := range values { 60 v := &values[i] 61 if v.IsNull() { 62 values[i] = v.Level(0, 0, 0) 63 } else { 64 values[i] = v.Level(0, 1, 0) 65 } 66 } 67 _, err := w.WriteRows([]parquet.Row{parquet.MakeRow(values)}) 68 require.NoError(t, err) 69 70 err = w.Close() 71 require.NoError(t, err) 72 73 f, err := parquet.OpenFile(bytes.NewReader(buf.Bytes()), int64(buf.Len())) 74 require.NoError(t, err) 75 76 col := f.Root().Column("c") 77 page, err := col.Pages().ReadPage() 78 require.NoError(t, err) 79 80 vec := proc.GetVector(types.New(types.T_varchar, 0, 0)) 81 var h ParquetHandler 82 err = h.getMapper(col, plan.Type{ 83 Id: int32(types.T_varchar), 84 }).mapping(page, proc, vec) 85 require.NoError(t, err) 86 87 require.Equal(t, len(values), vec.Length()) 88 for i, v := range values { 89 if v.IsNull() { 90 require.True(t, vec.IsNull(uint64(i))) 91 } else { 92 require.Equal(t, v.String(), vec.GetStringAt(i)) 93 } 94 } 95 }) 96 97 tests := []struct { 98 st parquet.Type 99 numValues int 100 values encoding.Values 101 dt types.T 102 expected string 103 expectedOpt string 104 }{ 105 { 106 st: parquet.BooleanType, 107 numValues: 2, 108 values: encoding.BooleanValues([]byte{2}), 109 dt: types.T_bool, 110 expectedOpt: "[false false true false]-[0 3]", 111 }, 112 { 113 st: parquet.Int32Type, 114 numValues: 2, 115 values: encoding.Int32Values([]int32{1, 5}), 116 dt: types.T_int32, 117 expectedOpt: "[0 1 5 0]-[0 3]", 118 }, 119 { 120 st: parquet.Int64Type, 121 numValues: 2, 122 values: encoding.Int64Values([]int64{2, 7}), 123 dt: types.T_int64, 124 expectedOpt: "[0 2 7 0]-[0 3]", 125 }, 126 { 127 st: parquet.Uint(32).Type(), 128 numValues: 2, 129 values: encoding.Uint32Values([]uint32{5, 3}), 130 dt: types.T_uint32, 131 expectedOpt: "[0 5 3 0]-[0 3]", 132 }, 133 { 134 st: parquet.Uint(64).Type(), 135 numValues: 2, 136 values: encoding.Uint64Values([]uint64{8, 10}), 137 dt: types.T_uint64, 138 expectedOpt: "[0 8 10 0]-[0 3]", 139 }, 140 { 141 st: parquet.Int64Type, 142 numValues: 2, 143 values: encoding.Int64Values([]int64{2, 7}), 144 dt: types.T_int64, 145 expectedOpt: "[0 2 7 0]-[0 3]", 146 }, 147 // { 148 // typ: parquet.Int96Type, 149 // }, 150 { 151 st: parquet.FloatType, 152 numValues: 2, 153 values: encoding.FloatValues([]float32{7.5, 3.2}), 154 dt: types.T_float32, 155 expectedOpt: "[0 7.5 3.2 0]-[0 3]", 156 }, 157 { 158 st: parquet.DoubleType, 159 numValues: 2, 160 values: encoding.DoubleValues([]float64{77.9, 0}), 161 dt: types.T_float64, 162 expectedOpt: "[0 77.9 0 0]-[0 3]", 163 }, 164 { 165 st: parquet.String().Type(), 166 numValues: 2, 167 values: encoding.ByteArrayValues([]byte("abcdefg"), []uint32{0, 3, 7}), 168 dt: types.T_varchar, 169 expectedOpt: "[ abc defg ]-[0 3]", 170 }, 171 { 172 st: parquet.FixedLenByteArrayType(3), 173 numValues: 2, 174 values: encoding.FixedLenByteArrayValues([]byte("abcdef"), 3), 175 dt: types.T_char, 176 expectedOpt: "[ abc def ]-[0 3]", 177 }, 178 { 179 st: parquet.Date().Type(), 180 numValues: 2, 181 values: encoding.Int32Values([]int32{357, 1245}), 182 dt: types.T_date, 183 expected: "[0001-12-24 0004-05-30]", 184 expectedOpt: "[0001-01-01 0001-12-24 0004-05-30 0001-01-01]-[0 3]", 185 }, 186 { 187 st: parquet.Time(parquet.Nanosecond).Type(), 188 numValues: 2, 189 values: encoding.Int64Values([]int64{18783_111111_111, 25783_222222_222}), 190 dt: types.T_time, 191 expected: "[05:13:03 07:09:43]", 192 expectedOpt: "[00:00:00 05:13:03 07:09:43 00:00:00]-[0 3]", 193 }, 194 { 195 st: parquet.Time(parquet.Microsecond).Type(), 196 numValues: 2, 197 values: encoding.Int64Values([]int64{18783_111111, 25783_222222}), 198 dt: types.T_time, 199 expected: "[05:13:03 07:09:43]", 200 expectedOpt: "[00:00:00 05:13:03 07:09:43 00:00:00]-[0 3]", 201 }, 202 { 203 st: parquet.Time(parquet.Millisecond).Type(), 204 numValues: 2, 205 values: encoding.Int32Values([]int32{18783_111, 25783_222}), 206 dt: types.T_time, 207 expected: "[05:13:03 07:09:43]", 208 expectedOpt: "[00:00:00 05:13:03 07:09:43 00:00:00]-[0 3]", 209 }, 210 { 211 st: parquet.Timestamp(parquet.Nanosecond).Type(), 212 numValues: 2, 213 values: encoding.Int64Values([]int64{1713419514_111111_111, 1713429514_222222_222}), 214 dt: types.T_timestamp, 215 expected: "[2024-04-18 05:51:54.111111 UTC 2024-04-18 08:38:34.222222 UTC]", 216 expectedOpt: "[0001-01-01 00:00:00.000000 UTC 2024-04-18 05:51:54.111111 UTC 2024-04-18 08:38:34.222222 UTC 0001-01-01 00:00:00.000000 UTC]-[0 3]", 217 }, 218 { 219 st: parquet.Timestamp(parquet.Microsecond).Type(), 220 numValues: 2, 221 values: encoding.Int64Values([]int64{1713419514_111111, 1713429514_222222}), 222 dt: types.T_timestamp, 223 expected: "[2024-04-18 05:51:54.111111 UTC 2024-04-18 08:38:34.222222 UTC]", 224 expectedOpt: "[0001-01-01 00:00:00.000000 UTC 2024-04-18 05:51:54.111111 UTC 2024-04-18 08:38:34.222222 UTC 0001-01-01 00:00:00.000000 UTC]-[0 3]", 225 }, 226 { 227 st: parquet.Timestamp(parquet.Millisecond).Type(), 228 numValues: 2, 229 values: encoding.Int64Values([]int64{1713419514_111, 1713429514_222}), 230 dt: types.T_timestamp, 231 expected: "[2024-04-18 05:51:54.111000 UTC 2024-04-18 08:38:34.222000 UTC]", 232 expectedOpt: "[0001-01-01 00:00:00.000000 UTC 2024-04-18 05:51:54.111000 UTC 2024-04-18 08:38:34.222000 UTC 0001-01-01 00:00:00.000000 UTC]-[0 3]", 233 }, 234 } 235 for _, tc := range tests { 236 t.Run(fmt.Sprintf("%s to %s not null", tc.st, tc.dt), func(t *testing.T) { 237 page := tc.st.NewPage(0, tc.numValues, tc.values) 238 239 var buf bytes.Buffer 240 schema := parquet.NewSchema("x", parquet.Group{ 241 "c": parquet.Leaf(tc.st), 242 }) 243 w := parquet.NewWriter(&buf, schema) 244 245 values := make([]parquet.Value, page.NumRows()) 246 page.Values().ReadValues(values) 247 _, err := w.WriteRows([]parquet.Row{parquet.MakeRow(values)}) 248 require.NoError(t, err) 249 err = w.Close() 250 require.NoError(t, err) 251 252 f, err := parquet.OpenFile(bytes.NewReader(buf.Bytes()), int64(buf.Len())) 253 require.NoError(t, err) 254 255 vec := proc.GetVector(types.New(tc.dt, 0, 0)) 256 var h ParquetHandler 257 err = h.getMapper(f.Root().Column("c"), plan.Type{ 258 Id: int32(tc.dt), 259 NotNullable: true, 260 }).mapping(page, proc, vec) 261 require.NoError(t, err) 262 if tc.expected != "" { 263 require.Equal(t, tc.expected, vec.String()) 264 } else { 265 require.Equal(t, fmt.Sprint(values), vec.String()) 266 } 267 }) 268 } 269 270 for _, tc := range tests { 271 t.Run(fmt.Sprintf("%s to %s null", tc.st, tc.dt), func(t *testing.T) { 272 var buf bytes.Buffer 273 schema := parquet.NewSchema("x", parquet.Group{ 274 "c": parquet.Optional(parquet.Leaf(tc.st)), 275 }) 276 w := parquet.NewWriter(&buf, schema) 277 278 err := w.Write(nil) 279 require.NoError(t, err) 280 281 page := tc.st.NewPage(0, tc.numValues, tc.values) 282 values := make([]parquet.Value, page.NumRows()) 283 page.Values().ReadValues(values) 284 for i := range values { 285 v := &values[i] 286 *v = v.Level(v.RepetitionLevel(), 1, v.Column()) 287 } 288 289 _, err = w.WriteRows([]parquet.Row{parquet.MakeRow(values)}) 290 require.NoError(t, err) 291 292 err = w.Write(nil) 293 require.NoError(t, err) 294 295 err = w.Close() 296 require.NoError(t, err) 297 298 f, err := parquet.OpenFile(bytes.NewReader(buf.Bytes()), int64(buf.Len())) 299 require.NoError(t, err) 300 301 vec := proc.GetVector(types.New(tc.dt, 0, 0)) 302 var h ParquetHandler 303 mp := h.getMapper(f.Root().Column("c"), plan.Type{ 304 Id: int32(tc.dt), 305 }) 306 307 pages := f.Root().Column("c").Pages() 308 page, _ = pages.ReadPage() 309 err = mp.mapping(page, proc, vec) 310 require.NoError(t, err) 311 if tc.expectedOpt != "" { 312 require.Equal(t, tc.expectedOpt, vec.String()) 313 } else { 314 require.Equal(t, fmt.Sprint(values), vec.String()) 315 } 316 }) 317 } 318 }