github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/region_test.go (about) 1 // Copyright 2019 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package mydump_test 15 16 import ( 17 "context" 18 "os" 19 "path/filepath" 20 21 "github.com/pingcap/br/pkg/storage" 22 23 . "github.com/pingcap/check" 24 25 "github.com/pingcap/br/pkg/lightning/config" 26 . "github.com/pingcap/br/pkg/lightning/mydump" 27 "github.com/pingcap/br/pkg/lightning/worker" 28 ) 29 30 var _ = Suite(&testMydumpRegionSuite{}) 31 32 type testMydumpRegionSuite struct{} 33 34 func (s *testMydumpRegionSuite) SetUpSuite(c *C) {} 35 func (s *testMydumpRegionSuite) TearDownSuite(c *C) {} 36 37 // var expectedTuplesCount = map[string]int64{ 38 // "i": 1, 39 // "report_case_high_risk": 1, 40 // "tbl_autoid": 10000, 41 // "tbl_multi_index": 10000, 42 // } 43 44 /* 45 TODO : test with specified 'regionBlockSize' ... 46 */ 47 func (s *testMydumpRegionSuite) TestTableRegion(c *C) { 48 cfg := newConfigWithSourceDir("./examples") 49 loader, _ := NewMyDumpLoader(context.Background(), cfg) 50 dbMeta := loader.GetDatabases()[0] 51 52 ioWorkers := worker.NewPool(context.Background(), 1, "io") 53 for _, meta := range dbMeta.Tables { 54 regions, err := MakeTableRegions(context.Background(), meta, 1, cfg, ioWorkers, loader.GetStore()) 55 c.Assert(err, IsNil) 56 57 // check - region-size vs file-size 58 var tolFileSize int64 = 0 59 for _, file := range meta.DataFiles { 60 tolFileSize += file.FileMeta.FileSize 61 } 62 var tolRegionSize int64 = 0 63 for _, region := range regions { 64 tolRegionSize += region.Size() 65 } 66 c.Assert(tolRegionSize, Equals, tolFileSize) 67 68 // // check - rows num 69 // var tolRows int64 = 0 70 // for _, region := range regions { 71 // tolRows += region.Rows() 72 // } 73 // c.Assert(tolRows, Equals, expectedTuplesCount[table]) 74 75 // check - range 76 regionNum := len(regions) 77 preReg := regions[0] 78 for i := 1; i < regionNum; i++ { 79 reg := regions[i] 80 if preReg.FileMeta.Path == reg.FileMeta.Path { 81 c.Assert(reg.Offset(), Equals, preReg.Offset()+preReg.Size()) 82 c.Assert(reg.RowIDMin(), Equals, preReg.RowIDMin()+preReg.Rows()) 83 } else { 84 c.Assert(reg.Offset, Equals, 0) 85 c.Assert(reg.RowIDMin(), Equals, 1) 86 } 87 preReg = reg 88 } 89 } 90 } 91 92 func (s *testMydumpRegionSuite) TestAllocateEngineIDs(c *C) { 93 dataFileSizes := make([]float64, 700) 94 for i := range dataFileSizes { 95 dataFileSizes[i] = 1.0 96 } 97 filesRegions := make([]*TableRegion, 0, len(dataFileSizes)) 98 for range dataFileSizes { 99 filesRegions = append(filesRegions, new(TableRegion)) 100 } 101 102 checkEngineSizes := func(what string, expected map[int32]int) { 103 actual := make(map[int32]int) 104 for _, region := range filesRegions { 105 actual[region.EngineID]++ 106 } 107 c.Assert(actual, DeepEquals, expected, Commentf("%s", what)) 108 } 109 110 // Batch size > Total size => Everything in the zero batch. 111 AllocateEngineIDs(filesRegions, dataFileSizes, 1000, 0.5, 1000) 112 checkEngineSizes("no batching", map[int32]int{ 113 0: 700, 114 }) 115 116 // Allocate 3 engines. 117 AllocateEngineIDs(filesRegions, dataFileSizes, 200, 0.5, 1000) 118 checkEngineSizes("batch size = 200", map[int32]int{ 119 0: 170, 120 1: 213, 121 2: 317, 122 }) 123 124 // Allocate 3 engines with an alternative ratio 125 AllocateEngineIDs(filesRegions, dataFileSizes, 200, 0.6, 1000) 126 checkEngineSizes("batch size = 200, ratio = 0.6", map[int32]int{ 127 0: 160, 128 1: 208, 129 2: 332, 130 }) 131 132 // Allocate 5 engines. 133 AllocateEngineIDs(filesRegions, dataFileSizes, 100, 0.5, 1000) 134 checkEngineSizes("batch size = 100", map[int32]int{ 135 0: 93, 136 1: 105, 137 2: 122, 138 3: 153, 139 4: 227, 140 }) 141 142 // Number of engines > table concurrency 143 AllocateEngineIDs(filesRegions, dataFileSizes, 50, 0.5, 4) 144 checkEngineSizes("batch size = 50, limit table conc = 4", map[int32]int{ 145 0: 50, 146 1: 59, 147 2: 73, 148 3: 110, 149 4: 50, 150 5: 50, 151 6: 50, 152 7: 50, 153 8: 50, 154 9: 50, 155 10: 50, 156 11: 50, 157 12: 8, 158 }) 159 160 // Zero ratio = Uniform 161 AllocateEngineIDs(filesRegions, dataFileSizes, 100, 0.0, 1000) 162 checkEngineSizes("batch size = 100, ratio = 0", map[int32]int{ 163 0: 100, 164 1: 100, 165 2: 100, 166 3: 100, 167 4: 100, 168 5: 100, 169 6: 100, 170 }) 171 } 172 173 func (s *testMydumpRegionSuite) TestSplitLargeFile(c *C) { 174 meta := &MDTableMeta{ 175 DB: "csv", 176 Name: "large_csv_file", 177 } 178 cfg := &config.Config{ 179 Mydumper: config.MydumperRuntime{ 180 ReadBlockSize: config.ReadBlockSize, 181 CSV: config.CSVConfig{ 182 Separator: ",", 183 Delimiter: "", 184 Header: true, 185 TrimLastSep: false, 186 NotNull: false, 187 Null: "NULL", 188 BackslashEscape: true, 189 }, 190 StrictFormat: true, 191 Filter: []string{"*.*"}, 192 }, 193 } 194 filePath := "./csv/split_large_file.csv" 195 dataFileInfo, err := os.Stat(filePath) 196 c.Assert(err, IsNil) 197 fileSize := dataFileInfo.Size() 198 fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: filePath, Type: SourceTypeCSV, FileSize: fileSize}} 199 colCnt := int64(3) 200 columns := []string{"a", "b", "c"} 201 for _, tc := range []struct { 202 maxRegionSize config.ByteSize 203 offsets [][]int64 204 }{ 205 {1, [][]int64{{6, 12}, {12, 18}, {18, 24}, {24, 30}}}, 206 {6, [][]int64{{6, 18}, {18, 30}}}, 207 {8, [][]int64{{6, 18}, {18, 30}}}, 208 {12, [][]int64{{6, 24}, {24, 30}}}, 209 {13, [][]int64{{6, 24}, {24, 30}}}, 210 {18, [][]int64{{6, 30}}}, 211 {19, [][]int64{{6, 30}}}, 212 } { 213 cfg.Mydumper.MaxRegionSize = tc.maxRegionSize 214 prevRowIdxMax := int64(0) 215 ioWorker := worker.NewPool(context.Background(), 4, "io") 216 217 store, err := storage.NewLocalStorage(".") 218 c.Assert(err, IsNil) 219 220 _, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store) 221 c.Assert(err, IsNil) 222 c.Assert(regions, HasLen, len(tc.offsets)) 223 for i := range tc.offsets { 224 c.Assert(regions[i].Chunk.Offset, Equals, tc.offsets[i][0]) 225 c.Assert(regions[i].Chunk.EndOffset, Equals, tc.offsets[i][1]) 226 c.Assert(regions[i].Chunk.Columns, DeepEquals, columns) 227 } 228 } 229 } 230 231 func (s *testMydumpRegionSuite) TestSplitLargeFileNoNewLineAtEOF(c *C) { 232 meta := &MDTableMeta{ 233 DB: "csv", 234 Name: "large_csv_file", 235 } 236 cfg := &config.Config{ 237 Mydumper: config.MydumperRuntime{ 238 ReadBlockSize: config.ReadBlockSize, 239 CSV: config.CSVConfig{ 240 Separator: ",", 241 Delimiter: "", 242 Header: true, 243 TrimLastSep: false, 244 NotNull: false, 245 Null: "NULL", 246 BackslashEscape: true, 247 }, 248 StrictFormat: true, 249 Filter: []string{"*.*"}, 250 MaxRegionSize: 1, 251 }, 252 } 253 254 dir := c.MkDir() 255 256 fileName := "test.csv" 257 filePath := filepath.Join(dir, fileName) 258 259 content := []byte("a,b\r\n123,456\r\n789,101") 260 err := os.WriteFile(filePath, content, 0o644) 261 c.Assert(err, IsNil) 262 263 dataFileInfo, err := os.Stat(filePath) 264 c.Assert(err, IsNil) 265 fileSize := dataFileInfo.Size() 266 fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: fileName, Type: SourceTypeCSV, FileSize: fileSize}} 267 colCnt := int64(2) 268 columns := []string{"a", "b"} 269 prevRowIdxMax := int64(0) 270 ioWorker := worker.NewPool(context.Background(), 4, "io") 271 272 store, err := storage.NewLocalStorage(dir) 273 c.Assert(err, IsNil) 274 275 offsets := [][]int64{{4, 13}, {13, 21}} 276 277 _, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store) 278 c.Assert(err, IsNil) 279 c.Assert(regions, HasLen, len(offsets)) 280 for i := range offsets { 281 c.Assert(regions[i].Chunk.Offset, Equals, offsets[i][0]) 282 c.Assert(regions[i].Chunk.EndOffset, Equals, offsets[i][1]) 283 c.Assert(regions[i].Chunk.Columns, DeepEquals, columns) 284 } 285 } 286 287 func (s *testMydumpRegionSuite) TestSplitLargeFileWithCustomTerminator(c *C) { 288 meta := &MDTableMeta{ 289 DB: "csv", 290 Name: "large_csv_with_custom_terminator", 291 } 292 cfg := &config.Config{ 293 Mydumper: config.MydumperRuntime{ 294 ReadBlockSize: config.ReadBlockSize, 295 CSV: config.CSVConfig{ 296 Separator: "|+|", 297 Terminator: "|+|\n", 298 }, 299 StrictFormat: true, 300 Filter: []string{"*.*"}, 301 MaxRegionSize: 1, 302 }, 303 } 304 305 dir := c.MkDir() 306 307 fileName := "test2.csv" 308 filePath := filepath.Join(dir, fileName) 309 310 content := []byte("5|+|abc\ndef\nghi|+|6|+|\n7|+|xyz|+|8|+|\n9|+||+|10") 311 err := os.WriteFile(filePath, content, 0o644) 312 c.Assert(err, IsNil) 313 314 dataFileInfo, err := os.Stat(filePath) 315 c.Assert(err, IsNil) 316 fileSize := dataFileInfo.Size() 317 fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: fileName, Type: SourceTypeCSV, FileSize: fileSize}} 318 colCnt := int64(3) 319 prevRowIdxMax := int64(0) 320 ioWorker := worker.NewPool(context.Background(), 4, "io") 321 322 store, err := storage.NewLocalStorage(dir) 323 c.Assert(err, IsNil) 324 325 offsets := [][]int64{{0, 23}, {23, 38}, {38, 47}} 326 327 _, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store) 328 c.Assert(err, IsNil) 329 c.Assert(regions, HasLen, len(offsets)) 330 for i := range offsets { 331 c.Assert(regions[i].Chunk.Offset, Equals, offsets[i][0]) 332 c.Assert(regions[i].Chunk.EndOffset, Equals, offsets[i][1]) 333 } 334 } 335 336 func (s *testMydumpRegionSuite) TestSplitLargeFileOnlyOneChunk(c *C) { 337 meta := &MDTableMeta{ 338 DB: "csv", 339 Name: "large_csv_file", 340 } 341 cfg := &config.Config{ 342 Mydumper: config.MydumperRuntime{ 343 ReadBlockSize: config.ReadBlockSize, 344 CSV: config.CSVConfig{ 345 Separator: ",", 346 Delimiter: "", 347 Header: true, 348 TrimLastSep: false, 349 NotNull: false, 350 Null: "NULL", 351 BackslashEscape: true, 352 }, 353 StrictFormat: true, 354 Filter: []string{"*.*"}, 355 MaxRegionSize: 15, 356 }, 357 } 358 359 dir := c.MkDir() 360 361 fileName := "test.csv" 362 filePath := filepath.Join(dir, fileName) 363 364 content := []byte("field1,field2\r\n123,456\r\n") 365 err := os.WriteFile(filePath, content, 0o644) 366 c.Assert(err, IsNil) 367 368 dataFileInfo, err := os.Stat(filePath) 369 c.Assert(err, IsNil) 370 fileSize := dataFileInfo.Size() 371 fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: fileName, Type: SourceTypeCSV, FileSize: fileSize}} 372 colCnt := int64(2) 373 columns := []string{"field1", "field2"} 374 prevRowIdxMax := int64(0) 375 ioWorker := worker.NewPool(context.Background(), 4, "io") 376 377 store, err := storage.NewLocalStorage(dir) 378 c.Assert(err, IsNil) 379 380 offsets := [][]int64{{14, 24}} 381 382 _, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store) 383 c.Assert(err, IsNil) 384 c.Assert(regions, HasLen, len(offsets)) 385 for i := range offsets { 386 c.Assert(regions[i].Chunk.Offset, Equals, offsets[i][0]) 387 c.Assert(regions[i].Chunk.EndOffset, Equals, offsets[i][1]) 388 c.Assert(regions[i].Chunk.Columns, DeepEquals, columns) 389 } 390 }