github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/parquet_parser_test.go (about)

     1  package mydump
     2  
     3  import (
     4  	"context"
     5  	"io"
     6  	"path/filepath"
     7  	"strconv"
     8  	"time"
     9  
    10  	. "github.com/pingcap/check"
    11  	"github.com/pingcap/tidb/types"
    12  	"github.com/xitongsys/parquet-go-source/local"
    13  	writer2 "github.com/xitongsys/parquet-go/writer"
    14  
    15  	"github.com/pingcap/br/pkg/storage"
    16  )
    17  
// testParquetParserSuite is the stateless receiver that groups the parquet
// parser tests in this file.
type testParquetParserSuite struct{}

// Hand the suite to the check framework's Suite function so its Test* methods
// are picked up by the test runner.
var _ = Suite(testParquetParserSuite{})
    21  
    22  func (s testParquetParserSuite) TestParquetParser(c *C) {
    23  	type Test struct {
    24  		S string `parquet:"name=sS, type=UTF8, encoding=PLAIN_DICTIONARY"`
    25  		A int32  `parquet:"name=a_A, type=INT32"`
    26  	}
    27  
    28  	dir := c.MkDir()
    29  	// prepare data
    30  	name := "test123.parquet"
    31  	testPath := filepath.Join(dir, name)
    32  	pf, err := local.NewLocalFileWriter(testPath)
    33  	c.Assert(err, IsNil)
    34  	test := &Test{}
    35  	writer, err := writer2.NewParquetWriter(pf, test, 2)
    36  	c.Assert(err, IsNil)
    37  
    38  	for i := 0; i < 100; i++ {
    39  		test.A = int32(i)
    40  		test.S = strconv.Itoa(i)
    41  		c.Assert(writer.Write(test), IsNil)
    42  	}
    43  
    44  	c.Assert(writer.WriteStop(), IsNil)
    45  	c.Assert(pf.Close(), IsNil)
    46  
    47  	store, err := storage.NewLocalStorage(dir)
    48  	c.Assert(err, IsNil)
    49  	r, err := store.Open(context.TODO(), name)
    50  	c.Assert(err, IsNil)
    51  	reader, err := NewParquetParser(context.TODO(), store, r, name)
    52  	c.Assert(err, IsNil)
    53  	defer reader.Close()
    54  
    55  	c.Assert(reader.Columns(), DeepEquals, []string{"ss", "a_a"})
    56  
    57  	verifyRow := func(i int) {
    58  		c.Assert(reader.lastRow.RowID, Equals, int64(i+1))
    59  		c.Assert(len(reader.lastRow.Row), Equals, 2)
    60  		c.Assert(reader.lastRow.Row[0], DeepEquals, types.NewCollationStringDatum(strconv.Itoa(i), "", 0))
    61  		c.Assert(reader.lastRow.Row[1], DeepEquals, types.NewIntDatum(int64(i)))
    62  	}
    63  
    64  	// test read some rows
    65  	for i := 0; i < 10; i++ {
    66  		c.Assert(reader.ReadRow(), IsNil)
    67  		verifyRow(i)
    68  	}
    69  
    70  	// test set pos to pos < curpos + batchReadRowSize
    71  	c.Assert(reader.SetPos(15, 15), IsNil)
    72  	c.Assert(reader.ReadRow(), IsNil)
    73  	verifyRow(15)
    74  
    75  	// test set pos to pos > curpos + batchReadRowSize
    76  	c.Assert(reader.SetPos(80, 80), IsNil)
    77  	for i := 80; i < 100; i++ {
    78  		c.Assert(reader.ReadRow(), IsNil)
    79  		verifyRow(i)
    80  	}
    81  
    82  	c.Assert(reader.ReadRow(), Equals, io.EOF)
    83  }
    84  
    85  func (s testParquetParserSuite) TestParquetVariousTypes(c *C) {
    86  	// those deprecated TIME/TIMESTAMP types depend on the local timezone!
    87  	prevTZ := time.Local
    88  	time.Local = time.FixedZone("UTC+8", 8*60*60)
    89  	defer func() {
    90  		time.Local = prevTZ
    91  	}()
    92  
    93  	type Test struct {
    94  		Date            int32 `parquet:"name=date, type=DATE"`
    95  		TimeMillis      int32 `parquet:"name=timemillis, type=TIME_MILLIS"`
    96  		TimeMicros      int64 `parquet:"name=timemicros, type=TIME_MICROS"`
    97  		TimestampMillis int64 `parquet:"name=timestampmillis, type=TIMESTAMP_MILLIS"`
    98  		TimestampMicros int64 `parquet:"name=timestampmicros, type=TIMESTAMP_MICROS"`
    99  
   100  		Decimal1 int32 `parquet:"name=decimal1, type=DECIMAL, scale=2, precision=9, basetype=INT32"`
   101  		Decimal2 int32 `parquet:"name=decimal2, type=DECIMAL, scale=4, precision=4, basetype=INT32"`
   102  		Decimal3 int64 `parquet:"name=decimal3, type=DECIMAL, scale=2, precision=18, basetype=INT64"`
   103  		Decimal6 int32 `parquet:"name=decimal6, type=DECIMAL, scale=4, precision=4, basetype=INT32"`
   104  	}
   105  
   106  	dir := c.MkDir()
   107  	// prepare data
   108  	name := "test123.parquet"
   109  	testPath := filepath.Join(dir, name)
   110  	pf, err := local.NewLocalFileWriter(testPath)
   111  	c.Assert(err, IsNil)
   112  	test := &Test{}
   113  	writer, err := writer2.NewParquetWriter(pf, test, 2)
   114  	c.Assert(err, IsNil)
   115  
   116  	v := &Test{
   117  		Date:            18564,              // 2020-10-29
   118  		TimeMillis:      62775123,           // 17:26:15.123 (note all time are in UTC+8!)
   119  		TimeMicros:      62775123456,        // 17:26:15.123
   120  		TimestampMillis: 1603963672356,      // 2020-10-29T09:27:52.356Z
   121  		TimestampMicros: 1603963672356956,   // 2020-10-29T09:27:52.356956Z
   122  		Decimal1:        -12345678,          // -123456.78
   123  		Decimal2:        456,                // 0.0456
   124  		Decimal3:        123456789012345678, // 1234567890123456.78
   125  		Decimal6:        -1,                 // -0.0001
   126  	}
   127  	c.Assert(writer.Write(v), IsNil)
   128  	c.Assert(writer.WriteStop(), IsNil)
   129  	c.Assert(pf.Close(), IsNil)
   130  
   131  	store, err := storage.NewLocalStorage(dir)
   132  	c.Assert(err, IsNil)
   133  	r, err := store.Open(context.TODO(), name)
   134  	c.Assert(err, IsNil)
   135  	reader, err := NewParquetParser(context.TODO(), store, r, name)
   136  	c.Assert(err, IsNil)
   137  	defer reader.Close()
   138  
   139  	c.Assert(len(reader.columns), Equals, 9)
   140  
   141  	c.Assert(reader.ReadRow(), IsNil)
   142  	rowValue := []string{
   143  		"2020-10-29", "17:26:15.123Z", "17:26:15.123456Z", "2020-10-29 09:27:52.356Z", "2020-10-29 09:27:52.356956Z",
   144  		"-123456.78", "0.0456", "1234567890123456.78", "-0.0001",
   145  	}
   146  	row := reader.lastRow.Row
   147  	c.Assert(len(rowValue), Equals, len(row))
   148  	for i := 0; i < len(row); i++ {
   149  		c.Assert(row[i].Kind(), Equals, types.KindString)
   150  		c.Assert(rowValue[i], Equals, row[i].GetString())
   151  	}
   152  
   153  	type TestDecimal struct {
   154  		Decimal1   int32  `parquet:"name=decimal1, type=DECIMAL, scale=3, precision=5, basetype=INT32"`
   155  		DecimalRef *int32 `parquet:"name=decimal2, type=DECIMAL, scale=3, precision=5, basetype=INT32"`
   156  	}
   157  
   158  	cases := [][]interface{}{
   159  		{int32(0), "0.000"},
   160  		{int32(1000), "1.000"},
   161  		{int32(-1000), "-1.000"},
   162  		{int32(999), "0.999"},
   163  		{int32(-999), "-0.999"},
   164  		{int32(1), "0.001"},
   165  		{int32(-1), "-0.001"},
   166  	}
   167  
   168  	fileName := "test.02.parquet"
   169  	testPath = filepath.Join(dir, fileName)
   170  	pf, err = local.NewLocalFileWriter(testPath)
   171  	td := &TestDecimal{}
   172  	c.Assert(err, IsNil)
   173  	writer, err = writer2.NewParquetWriter(pf, td, 2)
   174  	c.Assert(err, IsNil)
   175  	for i, testCase := range cases {
   176  		val := testCase[0].(int32)
   177  		td.Decimal1 = val
   178  		if i%2 == 0 {
   179  			td.DecimalRef = &val
   180  		} else {
   181  			td.DecimalRef = nil
   182  		}
   183  		c.Assert(writer.Write(td), IsNil)
   184  	}
   185  	c.Assert(writer.WriteStop(), IsNil)
   186  	c.Assert(pf.Close(), IsNil)
   187  
   188  	r, err = store.Open(context.TODO(), fileName)
   189  	c.Assert(err, IsNil)
   190  	reader, err = NewParquetParser(context.TODO(), store, r, fileName)
   191  	c.Assert(err, IsNil)
   192  	defer reader.Close()
   193  
   194  	for i, testCase := range cases {
   195  		c.Assert(reader.ReadRow(), IsNil)
   196  		vals := []types.Datum{types.NewCollationStringDatum(testCase[1].(string), "", 0)}
   197  		if i%2 == 0 {
   198  			vals = append(vals, vals[0])
   199  		} else {
   200  			vals = append(vals, types.Datum{})
   201  		}
   202  		// because we always reuse the datums in reader.lastRow.Row, so we can't directly
   203  		// compare will `DeepEqual` here
   204  		c.Assert(len(reader.lastRow.Row), Equals, len(vals))
   205  		for i, val := range vals {
   206  			c.Assert(reader.lastRow.Row[i].Kind(), Equals, val.Kind())
   207  			c.Assert(reader.lastRow.Row[i].GetValue(), Equals, val.GetValue())
   208  		}
   209  	}
   210  }
   211  
   212  func (s testParquetParserSuite) TestParquetAurora(c *C) {
   213  	store, err := storage.NewLocalStorage("examples")
   214  	c.Assert(err, IsNil)
   215  
   216  	fileName := "test.parquet"
   217  	r, err := store.Open(context.TODO(), fileName)
   218  	c.Assert(err, IsNil)
   219  	parser, err := NewParquetParser(context.TODO(), store, r, fileName)
   220  	c.Assert(err, IsNil)
   221  
   222  	c.Assert(parser.Columns(), DeepEquals, []string{"id", "val1", "val2", "d1", "d2", "d3", "d4", "d5", "d6"})
   223  
   224  	expectedRes := [][]interface{}{
   225  		{int64(1), int64(1), "0", int64(123), "1.23", "0.00000001", "1234567890", "123", "1.23000000"},
   226  		{
   227  			int64(2), int64(123456), "0", int64(123456), "9999.99", "0.12345678", "99999999999999999999",
   228  			"999999999999999999999999999999999999", "99999999999999999999.99999999",
   229  		},
   230  		{
   231  			int64(3), int64(123456), "0", int64(-123456), "-9999.99", "-0.12340000", "-99999999999999999999",
   232  			"-999999999999999999999999999999999999", "-99999999999999999999.99999999",
   233  		},
   234  		{
   235  			int64(4), int64(1), "0", int64(123), "1.23", "0.00000001", "1234567890", "123", "1.23000000",
   236  		},
   237  		{
   238  			int64(5), int64(123456), "0", int64(123456), "9999.99", "0.12345678", "12345678901234567890",
   239  			"123456789012345678901234567890123456", "99999999999999999999.99999999",
   240  		},
   241  		{
   242  			int64(6), int64(123456), "0", int64(-123456), "-9999.99", "-0.12340000",
   243  			"-12345678901234567890", "-123456789012345678901234567890123456",
   244  			"-99999999999999999999.99999999",
   245  		},
   246  	}
   247  
   248  	for i := 0; i < len(expectedRes); i++ {
   249  		err = parser.ReadRow()
   250  		c.Assert(err, IsNil)
   251  		expectedValues := expectedRes[i]
   252  		row := parser.LastRow().Row
   253  		c.Assert(len(expectedValues), Equals, len(row))
   254  		for j := 0; j < len(row); j++ {
   255  			switch v := expectedValues[j].(type) {
   256  			case int64:
   257  				c.Assert(v, Equals, row[j].GetInt64())
   258  			case string:
   259  				c.Assert(v, Equals, row[j].GetString())
   260  			default:
   261  				c.Error("unexpected value: ", expectedValues[j])
   262  			}
   263  		}
   264  	}
   265  
   266  	c.Assert(parser.ReadRow(), Equals, io.EOF)
   267  }