github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/parquet_parser_test.go (about) 1 package mydump 2 3 import ( 4 "context" 5 "io" 6 "path/filepath" 7 "strconv" 8 "time" 9 10 "github.com/pingcap/br/pkg/storage" 11 . "github.com/pingcap/check" 12 "github.com/pingcap/tidb/types" 13 "github.com/xitongsys/parquet-go-source/local" 14 writer2 "github.com/xitongsys/parquet-go/writer" 15 ) 16 17 type testParquetParserSuite struct{} 18 19 var _ = Suite(testParquetParserSuite{}) 20 21 func (s testParquetParserSuite) TestParquetParser(c *C) { 22 type Test struct { 23 S string `parquet:"name=sS, type=UTF8, encoding=PLAIN_DICTIONARY"` 24 A int32 `parquet:"name=a_A, type=INT32"` 25 } 26 27 dir := c.MkDir() 28 // prepare data 29 name := "test123.parquet" 30 testPath := filepath.Join(dir, name) 31 pf, err := local.NewLocalFileWriter(testPath) 32 c.Assert(err, IsNil) 33 test := &Test{} 34 writer, err := writer2.NewParquetWriter(pf, test, 2) 35 c.Assert(err, IsNil) 36 37 for i := 0; i < 100; i++ { 38 test.A = int32(i) 39 test.S = strconv.Itoa(i) 40 c.Assert(writer.Write(test), IsNil) 41 } 42 43 c.Assert(writer.WriteStop(), IsNil) 44 c.Assert(pf.Close(), IsNil) 45 46 store, err := storage.NewLocalStorage(dir) 47 c.Assert(err, IsNil) 48 r, err := store.Open(context.TODO(), name) 49 c.Assert(err, IsNil) 50 reader, err := NewParquetParser(context.TODO(), store, r, name) 51 c.Assert(err, IsNil) 52 defer reader.Close() 53 54 c.Assert(reader.Columns(), DeepEquals, []string{"ss", "a_a"}) 55 56 verifyRow := func(i int) { 57 c.Assert(reader.lastRow.RowID, Equals, int64(i+1)) 58 c.Assert(len(reader.lastRow.Row), Equals, 2) 59 c.Assert(reader.lastRow.Row[0], DeepEquals, types.NewCollationStringDatum(strconv.Itoa(i), "", 0)) 60 c.Assert(reader.lastRow.Row[1], DeepEquals, types.NewIntDatum(int64(i))) 61 } 62 63 // test read some rows 64 for i := 0; i < 10; i++ { 65 c.Assert(reader.ReadRow(), IsNil) 66 verifyRow(i) 67 } 68 69 // test set pos to pos < curpos + batchReadRowSize 70 c.Assert(reader.SetPos(15, 15), IsNil) 71 c.Assert(reader.ReadRow(), IsNil) 72 verifyRow(15) 73 74 // test set pos to pos > curpos + batchReadRowSize 75 c.Assert(reader.SetPos(80, 80), IsNil) 76 for i := 80; i < 100; i++ { 77 c.Assert(reader.ReadRow(), IsNil) 78 verifyRow(i) 79 } 80 81 c.Assert(reader.ReadRow(), Equals, io.EOF) 82 } 83 84 func (s testParquetParserSuite) TestParquetVariousTypes(c *C) { 85 // those deprecated TIME/TIMESTAMP types depend on the local timezone! 86 prevTZ := time.Local 87 time.Local = time.FixedZone("UTC+8", 8*60*60) 88 defer func() { 89 time.Local = prevTZ 90 }() 91 92 type Test struct { 93 Date int32 `parquet:"name=date, type=DATE"` 94 TimeMillis int32 `parquet:"name=timemillis, type=TIME_MILLIS"` 95 TimeMicros int64 `parquet:"name=timemicros, type=TIME_MICROS"` 96 TimestampMillis int64 `parquet:"name=timestampmillis, type=TIMESTAMP_MILLIS"` 97 TimestampMicros int64 `parquet:"name=timestampmicros, type=TIMESTAMP_MICROS"` 98 99 Decimal1 int32 `parquet:"name=decimal1, type=DECIMAL, scale=2, precision=9, basetype=INT32"` 100 Decimal2 int32 `parquet:"name=decimal2, type=DECIMAL, scale=4, precision=4, basetype=INT32"` 101 Decimal3 int64 `parquet:"name=decimal3, type=DECIMAL, scale=2, precision=18, basetype=INT64"` 102 Decimal4 string `parquet:"name=decimal4, type=DECIMAL, scale=2, precision=10, basetype=FIXED_LEN_BYTE_ARRAY, length=12"` 103 Decimal5 string `parquet:"name=decimal5, type=DECIMAL, scale=2, precision=20, basetype=BYTE_ARRAY"` 104 Decimal6 int32 `parquet:"name=decimal6, type=DECIMAL, scale=4, precision=4, basetype=INT32"` 105 } 106 107 dir := c.MkDir() 108 // prepare data 109 name := "test123.parquet" 110 testPath := filepath.Join(dir, name) 111 pf, err := local.NewLocalFileWriter(testPath) 112 c.Assert(err, IsNil) 113 test := &Test{} 114 writer, err := writer2.NewParquetWriter(pf, test, 2) 115 c.Assert(err, IsNil) 116 117 v := &Test{ 118 Date: 18564, //2020-10-29 119 TimeMillis: 62775123, // 17:26:15.123 (note all time are in UTC+8!) 120 TimeMicros: 62775123000, // 17:26:15.123 121 TimestampMillis: 1603963672356, // 2020-10-29T17:27:52.356 122 TimestampMicros: 1603963672356956, //2020-10-29T17:27:52.356956 123 Decimal1: -12345678, // -123456.78 124 Decimal2: 456, // 0.0456 125 Decimal3: 123456789012345678, //1234567890123456.78 126 Decimal4: "-12345678.09", 127 Decimal5: "-1234567890123456.78", 128 Decimal6: -1, // -0.0001 129 } 130 c.Assert(writer.Write(v), IsNil) 131 c.Assert(writer.WriteStop(), IsNil) 132 c.Assert(pf.Close(), IsNil) 133 134 store, err := storage.NewLocalStorage(dir) 135 c.Assert(err, IsNil) 136 r, err := store.Open(context.TODO(), name) 137 c.Assert(err, IsNil) 138 reader, err := NewParquetParser(context.TODO(), store, r, name) 139 c.Assert(err, IsNil) 140 defer reader.Close() 141 142 c.Assert(len(reader.columns), Equals, 11) 143 144 c.Assert(reader.ReadRow(), IsNil) 145 c.Assert(reader.lastRow.Row, DeepEquals, []types.Datum{ 146 types.NewCollationStringDatum("2020-10-29", "", 0), 147 types.NewCollationStringDatum("17:26:15.123", "", 0), 148 types.NewCollationStringDatum("17:26:15.123", "", 0), 149 types.NewCollationStringDatum("2020-10-29 17:27:52.356", "", 0), 150 types.NewCollationStringDatum("2020-10-29 17:27:52.356", "", 0), 151 types.NewCollationStringDatum("-123456.78", "", 0), 152 types.NewCollationStringDatum("0.0456", "", 0), 153 types.NewCollationStringDatum("1234567890123456.78", "", 0), 154 types.NewCollationStringDatum("-12345678.09", "", 0), 155 types.NewCollationStringDatum("-1234567890123456.78", "", 0), 156 types.NewCollationStringDatum("-0.0001", "", 0), 157 }) 158 159 type TestDecimal struct { 160 Decimal1 int32 `parquet:"name=decimal1, type=DECIMAL, scale=3, precision=5, basetype=INT32"` 161 DecimalRef *int32 `parquet:"name=decimal2, type=DECIMAL, scale=3, precision=5, basetype=INT32"` 162 } 163 164 cases := [][]interface{}{ 165 {int32(0), "0.000"}, 166 {int32(1000), "1.000"}, 167 {int32(-1000), "-1.000"}, 168 {int32(999), "0.999"}, 169 {int32(-999), "-0.999"}, 170 {int32(1), "0.001"}, 171 {int32(-1), "-0.001"}, 172 } 173 174 fileName := "test.02.parquet" 175 testPath = filepath.Join(dir, fileName) 176 pf, err = local.NewLocalFileWriter(testPath) 177 td := &TestDecimal{} 178 c.Assert(err, IsNil) 179 writer, err = writer2.NewParquetWriter(pf, td, 2) 180 c.Assert(err, IsNil) 181 for i, testCase := range cases { 182 val := testCase[0].(int32) 183 td.Decimal1 = val 184 if i%2 == 0 { 185 td.DecimalRef = &val 186 } else { 187 td.DecimalRef = nil 188 } 189 c.Assert(writer.Write(td), IsNil) 190 } 191 c.Assert(writer.WriteStop(), IsNil) 192 c.Assert(pf.Close(), IsNil) 193 194 r, err = store.Open(context.TODO(), fileName) 195 c.Assert(err, IsNil) 196 reader, err = NewParquetParser(context.TODO(), store, r, fileName) 197 c.Assert(err, IsNil) 198 defer reader.Close() 199 200 for i, testCase := range cases { 201 c.Assert(reader.ReadRow(), IsNil) 202 vals := []types.Datum{types.NewCollationStringDatum(testCase[1].(string), "", 0)} 203 if i%2 == 0 { 204 vals = append(vals, vals[0]) 205 } else { 206 vals = append(vals, types.Datum{}) 207 } 208 // because we always reuse the datums in reader.lastRow.Row, so we can't directly 209 // compare will `DeepEqual` here 210 c.Assert(len(reader.lastRow.Row), Equals, len(vals)) 211 for i, val := range vals { 212 c.Assert(reader.lastRow.Row[i].Kind(), Equals, val.Kind()) 213 c.Assert(reader.lastRow.Row[i].GetValue(), Equals, val.GetValue()) 214 } 215 } 216 }