github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/parquet_parser_test.go (about)

     1  package mydump
     2  
     3  import (
     4  	"context"
     5  	"io"
     6  	"path/filepath"
     7  	"strconv"
     8  	"time"
     9  
    10  	"github.com/pingcap/br/pkg/storage"
    11  	. "github.com/pingcap/check"
    12  	"github.com/pingcap/tidb/types"
    13  	"github.com/xitongsys/parquet-go-source/local"
    14  	writer2 "github.com/xitongsys/parquet-go/writer"
    15  )
    16  
// testParquetParserSuite is the stateless receiver for the parquet parser
// test methods defined in this file.
type testParquetParserSuite struct{}

// Hand a suite value to Suite (from the dot-imported check package);
// presumably this is what lets the check runner discover its Test* methods.
var _ = Suite(testParquetParserSuite{})
    20  
    21  func (s testParquetParserSuite) TestParquetParser(c *C) {
    22  	type Test struct {
    23  		S string `parquet:"name=sS, type=UTF8, encoding=PLAIN_DICTIONARY"`
    24  		A int32  `parquet:"name=a_A, type=INT32"`
    25  	}
    26  
    27  	dir := c.MkDir()
    28  	// prepare data
    29  	name := "test123.parquet"
    30  	testPath := filepath.Join(dir, name)
    31  	pf, err := local.NewLocalFileWriter(testPath)
    32  	c.Assert(err, IsNil)
    33  	test := &Test{}
    34  	writer, err := writer2.NewParquetWriter(pf, test, 2)
    35  	c.Assert(err, IsNil)
    36  
    37  	for i := 0; i < 100; i++ {
    38  		test.A = int32(i)
    39  		test.S = strconv.Itoa(i)
    40  		c.Assert(writer.Write(test), IsNil)
    41  	}
    42  
    43  	c.Assert(writer.WriteStop(), IsNil)
    44  	c.Assert(pf.Close(), IsNil)
    45  
    46  	store, err := storage.NewLocalStorage(dir)
    47  	c.Assert(err, IsNil)
    48  	r, err := store.Open(context.TODO(), name)
    49  	c.Assert(err, IsNil)
    50  	reader, err := NewParquetParser(context.TODO(), store, r, name)
    51  	c.Assert(err, IsNil)
    52  	defer reader.Close()
    53  
    54  	c.Assert(reader.Columns(), DeepEquals, []string{"ss", "a_a"})
    55  
    56  	verifyRow := func(i int) {
    57  		c.Assert(reader.lastRow.RowID, Equals, int64(i+1))
    58  		c.Assert(len(reader.lastRow.Row), Equals, 2)
    59  		c.Assert(reader.lastRow.Row[0], DeepEquals, types.NewCollationStringDatum(strconv.Itoa(i), "", 0))
    60  		c.Assert(reader.lastRow.Row[1], DeepEquals, types.NewIntDatum(int64(i)))
    61  	}
    62  
    63  	// test read some rows
    64  	for i := 0; i < 10; i++ {
    65  		c.Assert(reader.ReadRow(), IsNil)
    66  		verifyRow(i)
    67  	}
    68  
    69  	// test set pos to pos < curpos + batchReadRowSize
    70  	c.Assert(reader.SetPos(15, 15), IsNil)
    71  	c.Assert(reader.ReadRow(), IsNil)
    72  	verifyRow(15)
    73  
    74  	// test set pos to pos > curpos + batchReadRowSize
    75  	c.Assert(reader.SetPos(80, 80), IsNil)
    76  	for i := 80; i < 100; i++ {
    77  		c.Assert(reader.ReadRow(), IsNil)
    78  		verifyRow(i)
    79  	}
    80  
    81  	c.Assert(reader.ReadRow(), Equals, io.EOF)
    82  }
    83  
    84  func (s testParquetParserSuite) TestParquetVariousTypes(c *C) {
    85  	// those deprecated TIME/TIMESTAMP types depend on the local timezone!
    86  	prevTZ := time.Local
    87  	time.Local = time.FixedZone("UTC+8", 8*60*60)
    88  	defer func() {
    89  		time.Local = prevTZ
    90  	}()
    91  
    92  	type Test struct {
    93  		Date            int32 `parquet:"name=date, type=DATE"`
    94  		TimeMillis      int32 `parquet:"name=timemillis, type=TIME_MILLIS"`
    95  		TimeMicros      int64 `parquet:"name=timemicros, type=TIME_MICROS"`
    96  		TimestampMillis int64 `parquet:"name=timestampmillis, type=TIMESTAMP_MILLIS"`
    97  		TimestampMicros int64 `parquet:"name=timestampmicros, type=TIMESTAMP_MICROS"`
    98  
    99  		Decimal1 int32  `parquet:"name=decimal1, type=DECIMAL, scale=2, precision=9, basetype=INT32"`
   100  		Decimal2 int32  `parquet:"name=decimal2, type=DECIMAL, scale=4, precision=4, basetype=INT32"`
   101  		Decimal3 int64  `parquet:"name=decimal3, type=DECIMAL, scale=2, precision=18, basetype=INT64"`
   102  		Decimal4 string `parquet:"name=decimal4, type=DECIMAL, scale=2, precision=10, basetype=FIXED_LEN_BYTE_ARRAY, length=12"`
   103  		Decimal5 string `parquet:"name=decimal5, type=DECIMAL, scale=2, precision=20, basetype=BYTE_ARRAY"`
   104  		Decimal6 int32  `parquet:"name=decimal6, type=DECIMAL, scale=4, precision=4, basetype=INT32"`
   105  	}
   106  
   107  	dir := c.MkDir()
   108  	// prepare data
   109  	name := "test123.parquet"
   110  	testPath := filepath.Join(dir, name)
   111  	pf, err := local.NewLocalFileWriter(testPath)
   112  	c.Assert(err, IsNil)
   113  	test := &Test{}
   114  	writer, err := writer2.NewParquetWriter(pf, test, 2)
   115  	c.Assert(err, IsNil)
   116  
   117  	v := &Test{
   118  		Date:            18564,              //2020-10-29
   119  		TimeMillis:      62775123,           // 17:26:15.123 (note all time are in UTC+8!)
   120  		TimeMicros:      62775123000,        // 17:26:15.123
   121  		TimestampMillis: 1603963672356,      // 2020-10-29T17:27:52.356
   122  		TimestampMicros: 1603963672356956,   //2020-10-29T17:27:52.356956
   123  		Decimal1:        -12345678,          // -123456.78
   124  		Decimal2:        456,                // 0.0456
   125  		Decimal3:        123456789012345678, //1234567890123456.78
   126  		Decimal4:        "-12345678.09",
   127  		Decimal5:        "-1234567890123456.78",
   128  		Decimal6:        -1, // -0.0001
   129  	}
   130  	c.Assert(writer.Write(v), IsNil)
   131  	c.Assert(writer.WriteStop(), IsNil)
   132  	c.Assert(pf.Close(), IsNil)
   133  
   134  	store, err := storage.NewLocalStorage(dir)
   135  	c.Assert(err, IsNil)
   136  	r, err := store.Open(context.TODO(), name)
   137  	c.Assert(err, IsNil)
   138  	reader, err := NewParquetParser(context.TODO(), store, r, name)
   139  	c.Assert(err, IsNil)
   140  	defer reader.Close()
   141  
   142  	c.Assert(len(reader.columns), Equals, 11)
   143  
   144  	c.Assert(reader.ReadRow(), IsNil)
   145  	c.Assert(reader.lastRow.Row, DeepEquals, []types.Datum{
   146  		types.NewCollationStringDatum("2020-10-29", "", 0),
   147  		types.NewCollationStringDatum("17:26:15.123", "", 0),
   148  		types.NewCollationStringDatum("17:26:15.123", "", 0),
   149  		types.NewCollationStringDatum("2020-10-29 17:27:52.356", "", 0),
   150  		types.NewCollationStringDatum("2020-10-29 17:27:52.356", "", 0),
   151  		types.NewCollationStringDatum("-123456.78", "", 0),
   152  		types.NewCollationStringDatum("0.0456", "", 0),
   153  		types.NewCollationStringDatum("1234567890123456.78", "", 0),
   154  		types.NewCollationStringDatum("-12345678.09", "", 0),
   155  		types.NewCollationStringDatum("-1234567890123456.78", "", 0),
   156  		types.NewCollationStringDatum("-0.0001", "", 0),
   157  	})
   158  
   159  	type TestDecimal struct {
   160  		Decimal1   int32  `parquet:"name=decimal1, type=DECIMAL, scale=3, precision=5, basetype=INT32"`
   161  		DecimalRef *int32 `parquet:"name=decimal2, type=DECIMAL, scale=3, precision=5, basetype=INT32"`
   162  	}
   163  
   164  	cases := [][]interface{}{
   165  		{int32(0), "0.000"},
   166  		{int32(1000), "1.000"},
   167  		{int32(-1000), "-1.000"},
   168  		{int32(999), "0.999"},
   169  		{int32(-999), "-0.999"},
   170  		{int32(1), "0.001"},
   171  		{int32(-1), "-0.001"},
   172  	}
   173  
   174  	fileName := "test.02.parquet"
   175  	testPath = filepath.Join(dir, fileName)
   176  	pf, err = local.NewLocalFileWriter(testPath)
   177  	td := &TestDecimal{}
   178  	c.Assert(err, IsNil)
   179  	writer, err = writer2.NewParquetWriter(pf, td, 2)
   180  	c.Assert(err, IsNil)
   181  	for i, testCase := range cases {
   182  		val := testCase[0].(int32)
   183  		td.Decimal1 = val
   184  		if i%2 == 0 {
   185  			td.DecimalRef = &val
   186  		} else {
   187  			td.DecimalRef = nil
   188  		}
   189  		c.Assert(writer.Write(td), IsNil)
   190  	}
   191  	c.Assert(writer.WriteStop(), IsNil)
   192  	c.Assert(pf.Close(), IsNil)
   193  
   194  	r, err = store.Open(context.TODO(), fileName)
   195  	c.Assert(err, IsNil)
   196  	reader, err = NewParquetParser(context.TODO(), store, r, fileName)
   197  	c.Assert(err, IsNil)
   198  	defer reader.Close()
   199  
   200  	for i, testCase := range cases {
   201  		c.Assert(reader.ReadRow(), IsNil)
   202  		vals := []types.Datum{types.NewCollationStringDatum(testCase[1].(string), "", 0)}
   203  		if i%2 == 0 {
   204  			vals = append(vals, vals[0])
   205  		} else {
   206  			vals = append(vals, types.Datum{})
   207  		}
   208  		// because we always reuse the datums in reader.lastRow.Row, so we can't directly
   209  		// compare will `DeepEqual` here
   210  		c.Assert(len(reader.lastRow.Row), Equals, len(vals))
   211  		for i, val := range vals {
   212  			c.Assert(reader.lastRow.Row[i].Kind(), Equals, val.Kind())
   213  			c.Assert(reader.lastRow.Row[i].GetValue(), Equals, val.GetValue())
   214  		}
   215  	}
   216  }