github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/region_test.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package mydump_test

import (
	"context"
	"os"
	"path/filepath"

	"github.com/pingcap/br/pkg/storage"

	. "github.com/pingcap/check"

	"github.com/pingcap/br/pkg/lightning/config"
	. "github.com/pingcap/br/pkg/lightning/mydump"
	"github.com/pingcap/br/pkg/lightning/worker"
)

var _ = Suite(&testMydumpRegionSuite{})

type testMydumpRegionSuite struct{}

func (s *testMydumpRegionSuite) SetUpSuite(c *C)    {}
func (s *testMydumpRegionSuite) TearDownSuite(c *C) {}

// var expectedTuplesCount = map[string]int64{
// 	"i":                     1,
// 	"report_case_high_risk": 1,
// 	"tbl_autoid":            10000,
// 	"tbl_multi_index":       10000,
// }

// TODO: test with specified 'regionBlockSize' ...
func (s *testMydumpRegionSuite) TestTableRegion(c *C) {
	cfg := newConfigWithSourceDir("./examples")
	loader, err := NewMyDumpLoader(context.Background(), cfg)
	c.Assert(err, IsNil)
	dbMeta := loader.GetDatabases()[0]

	ioWorkers := worker.NewPool(context.Background(), 1, "io")
	for _, meta := range dbMeta.Tables {
		regions, err := MakeTableRegions(context.Background(), meta, 1, cfg, ioWorkers, loader.GetStore())
		c.Assert(err, IsNil)

		// check - region-size vs file-size
		var tolFileSize int64 = 0
		for _, file := range meta.DataFiles {
			tolFileSize += file.FileMeta.FileSize
		}
		var tolRegionSize int64 = 0
		for _, region := range regions {
			tolRegionSize += region.Size()
		}
		c.Assert(tolRegionSize, Equals, tolFileSize)

		// // check - rows num
		// var tolRows int64 = 0
		// for _, region := range regions {
		// 	tolRows += region.Rows()
		// }
		// c.Assert(tolRows, Equals, expectedTuplesCount[table])

		// check - range
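		// Regions coming from the same data file must be contiguous: each one
		// starts where the previous one ended, both in byte offset and in the
		// allocated row-ID range; a new file restarts at offset 0 and row ID 1.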
		regionNum := len(regions)
		preReg := regions[0]
		for i := 1; i < regionNum; i++ {
			reg := regions[i]
			if preReg.FileMeta.Path == reg.FileMeta.Path {
				c.Assert(reg.Offset(), Equals, preReg.Offset()+preReg.Size())
				c.Assert(reg.RowIDMin(), Equals, preReg.RowIDMin()+preReg.Rows())
			} else {
				c.Assert(reg.Offset(), Equals, int64(0))
				c.Assert(reg.RowIDMin(), Equals, int64(1))
			}
			preReg = reg
		}
	}
}

func (s *testMydumpRegionSuite) TestAllocateEngineIDs(c *C) {
	dataFileSizes := make([]float64, 700)
	for i := range dataFileSizes {
		dataFileSizes[i] = 1.0
	}
	filesRegions := make([]*TableRegion, 0, len(dataFileSizes))
	for range dataFileSizes {
		filesRegions = append(filesRegions, new(TableRegion))
	}

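	// Each case below re-runs AllocateEngineIDs over the same 700 unit-sized
	// regions and checks how many regions land in each engine; going by the
	// comments on the calls, the numeric arguments are the batch size, the
	// batch-import ratio, and the table-concurrency limit.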
	checkEngineSizes := func(what string, expected map[int32]int) {
		actual := make(map[int32]int)
		for _, region := range filesRegions {
			actual[region.EngineID]++
		}
		c.Assert(actual, DeepEquals, expected, Commentf("%s", what))
	}

	// Batch size > Total size => Everything in the zero batch.
	AllocateEngineIDs(filesRegions, dataFileSizes, 1000, 0.5, 1000)
	checkEngineSizes("no batching", map[int32]int{
		0: 700,
	})

	// Allocate 3 engines.
	AllocateEngineIDs(filesRegions, dataFileSizes, 200, 0.5, 1000)
	checkEngineSizes("batch size = 200", map[int32]int{
		0: 170,
		1: 213,
		2: 317,
	})

	// Allocate 3 engines with an alternative ratio.
	AllocateEngineIDs(filesRegions, dataFileSizes, 200, 0.6, 1000)
	checkEngineSizes("batch size = 200, ratio = 0.6", map[int32]int{
		0: 160,
		1: 208,
		2: 332,
	})

	// Allocate 5 engines.
	AllocateEngineIDs(filesRegions, dataFileSizes, 100, 0.5, 1000)
	checkEngineSizes("batch size = 100", map[int32]int{
		0: 93,
		1: 105,
		2: 122,
		3: 153,
		4: 227,
	})

	// Number of engines > table concurrency.
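	// Once the growing batches reach the table-concurrency cap (4 here), the
	// remaining regions are handed out in flat batches of the base batch size,
	// with the leftover going to the last engine.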
	AllocateEngineIDs(filesRegions, dataFileSizes, 50, 0.5, 4)
	checkEngineSizes("batch size = 50, limit table conc = 4", map[int32]int{
		0:  50,
		1:  59,
		2:  73,
		3:  110,
		4:  50,
		5:  50,
		6:  50,
		7:  50,
		8:  50,
		9:  50,
		10: 50,
		11: 50,
		12: 8,
	})

	// Zero ratio = Uniform.
	AllocateEngineIDs(filesRegions, dataFileSizes, 100, 0.0, 1000)
	checkEngineSizes("batch size = 100, ratio = 0", map[int32]int{
		0: 100,
		1: 100,
		2: 100,
		3: 100,
		4: 100,
		5: 100,
		6: 100,
	})
}

func (s *testMydumpRegionSuite) TestSplitLargeFile(c *C) {
	meta := &MDTableMeta{
		DB:   "csv",
		Name: "large_csv_file",
	}
	cfg := &config.Config{
		Mydumper: config.MydumperRuntime{
			ReadBlockSize: config.ReadBlockSize,
			CSV: config.CSVConfig{
				Separator:       ",",
				Delimiter:       "",
				Header:          true,
				TrimLastSep:     false,
				NotNull:         false,
				Null:            "NULL",
				BackslashEscape: true,
			},
			StrictFormat: true,
			Filter:       []string{"*.*"},
		},
	}
	filePath := "./csv/split_large_file.csv"
	dataFileInfo, err := os.Stat(filePath)
	c.Assert(err, IsNil)
	fileSize := dataFileInfo.Size()
	fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: filePath, Type: SourceTypeCSV, FileSize: fileSize}}
	colCnt := int64(3)
	columns := []string{"a", "b", "c"}
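	// Every split boundary below falls on a row boundary of split_large_file.csv;
	// judging from the expected offsets, the header row takes the first 6 bytes
	// and each data row is 6 bytes long, so regions always start at offset 6.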
	for _, tc := range []struct {
		maxRegionSize config.ByteSize
		offsets       [][]int64
	}{
		{1, [][]int64{{6, 12}, {12, 18}, {18, 24}, {24, 30}}},
		{6, [][]int64{{6, 18}, {18, 30}}},
		{8, [][]int64{{6, 18}, {18, 30}}},
		{12, [][]int64{{6, 24}, {24, 30}}},
		{13, [][]int64{{6, 24}, {24, 30}}},
		{18, [][]int64{{6, 30}}},
		{19, [][]int64{{6, 30}}},
	} {
		cfg.Mydumper.MaxRegionSize = tc.maxRegionSize
		prevRowIdxMax := int64(0)
		ioWorker := worker.NewPool(context.Background(), 4, "io")

		store, err := storage.NewLocalStorage(".")
		c.Assert(err, IsNil)

		_, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store)
		c.Assert(err, IsNil)
		c.Assert(regions, HasLen, len(tc.offsets))
		for i := range tc.offsets {
			c.Assert(regions[i].Chunk.Offset, Equals, tc.offsets[i][0])
			c.Assert(regions[i].Chunk.EndOffset, Equals, tc.offsets[i][1])
			c.Assert(regions[i].Chunk.Columns, DeepEquals, columns)
		}
	}
}

func (s *testMydumpRegionSuite) TestSplitLargeFileNoNewLineAtEOF(c *C) {
	meta := &MDTableMeta{
		DB:   "csv",
		Name: "large_csv_file",
	}
	cfg := &config.Config{
		Mydumper: config.MydumperRuntime{
			ReadBlockSize: config.ReadBlockSize,
			CSV: config.CSVConfig{
				Separator:       ",",
				Delimiter:       "",
				Header:          true,
				TrimLastSep:     false,
				NotNull:         false,
				Null:            "NULL",
				BackslashEscape: true,
			},
			StrictFormat:  true,
			Filter:        []string{"*.*"},
			MaxRegionSize: 1,
		},
	}

	dir := c.MkDir()

	fileName := "test.csv"
	filePath := filepath.Join(dir, fileName)

	content := []byte("a,b\r\n123,456\r\n789,101")
	err := os.WriteFile(filePath, content, 0o644)
	c.Assert(err, IsNil)

	dataFileInfo, err := os.Stat(filePath)
	c.Assert(err, IsNil)
	fileSize := dataFileInfo.Size()
	fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: fileName, Type: SourceTypeCSV, FileSize: fileSize}}
	colCnt := int64(2)
	columns := []string{"a", "b"}
	prevRowIdxMax := int64(0)
	ioWorker := worker.NewPool(context.Background(), 4, "io")

	store, err := storage.NewLocalStorage(dir)
	c.Assert(err, IsNil)

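	// The 21-byte file has no trailing newline; the last region must still
	// extend to EOF so the final row ("789,101") is not dropped.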
	offsets := [][]int64{{4, 13}, {13, 21}}

	_, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store)
	c.Assert(err, IsNil)
	c.Assert(regions, HasLen, len(offsets))
	for i := range offsets {
		c.Assert(regions[i].Chunk.Offset, Equals, offsets[i][0])
		c.Assert(regions[i].Chunk.EndOffset, Equals, offsets[i][1])
		c.Assert(regions[i].Chunk.Columns, DeepEquals, columns)
	}
}

func (s *testMydumpRegionSuite) TestSplitLargeFileWithCustomTerminator(c *C) {
	meta := &MDTableMeta{
		DB:   "csv",
		Name: "large_csv_with_custom_terminator",
	}
	cfg := &config.Config{
		Mydumper: config.MydumperRuntime{
			ReadBlockSize: config.ReadBlockSize,
			CSV: config.CSVConfig{
				Separator:  "|+|",
				Terminator: "|+|\n",
			},
			StrictFormat:  true,
			Filter:        []string{"*.*"},
			MaxRegionSize: 1,
		},
	}

	dir := c.MkDir()

	fileName := "test2.csv"
	filePath := filepath.Join(dir, fileName)

	content := []byte("5|+|abc\ndef\nghi|+|6|+|\n7|+|xyz|+|8|+|\n9|+||+|10")
	err := os.WriteFile(filePath, content, 0o644)
	c.Assert(err, IsNil)

	dataFileInfo, err := os.Stat(filePath)
	c.Assert(err, IsNil)
	fileSize := dataFileInfo.Size()
	fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: fileName, Type: SourceTypeCSV, FileSize: fileSize}}
	colCnt := int64(3)
	prevRowIdxMax := int64(0)
	ioWorker := worker.NewPool(context.Background(), 4, "io")

	store, err := storage.NewLocalStorage(dir)
	c.Assert(err, IsNil)

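	// Rows are terminated by the custom "|+|\n" sequence, so the bare "\n"
	// bytes inside the first field must not act as row boundaries; each
	// expected region ends right after a terminator (or at EOF for the last one).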
	offsets := [][]int64{{0, 23}, {23, 38}, {38, 47}}

	_, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store)
	c.Assert(err, IsNil)
	c.Assert(regions, HasLen, len(offsets))
	for i := range offsets {
		c.Assert(regions[i].Chunk.Offset, Equals, offsets[i][0])
		c.Assert(regions[i].Chunk.EndOffset, Equals, offsets[i][1])
	}
}

func (s *testMydumpRegionSuite) TestSplitLargeFileOnlyOneChunk(c *C) {
	meta := &MDTableMeta{
		DB:   "csv",
		Name: "large_csv_file",
	}
	cfg := &config.Config{
		Mydumper: config.MydumperRuntime{
			ReadBlockSize: config.ReadBlockSize,
			CSV: config.CSVConfig{
				Separator:       ",",
				Delimiter:       "",
				Header:          true,
				TrimLastSep:     false,
				NotNull:         false,
				Null:            "NULL",
				BackslashEscape: true,
			},
			StrictFormat:  true,
			Filter:        []string{"*.*"},
			MaxRegionSize: 15,
		},
	}

	dir := c.MkDir()

	fileName := "test.csv"
	filePath := filepath.Join(dir, fileName)

	content := []byte("field1,field2\r\n123,456\r\n")
	err := os.WriteFile(filePath, content, 0o644)
	c.Assert(err, IsNil)

	dataFileInfo, err := os.Stat(filePath)
	c.Assert(err, IsNil)
	fileSize := dataFileInfo.Size()
	fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: fileName, Type: SourceTypeCSV, FileSize: fileSize}}
	colCnt := int64(2)
	columns := []string{"field1", "field2"}
	prevRowIdxMax := int64(0)
	ioWorker := worker.NewPool(context.Background(), 4, "io")

	store, err := storage.NewLocalStorage(dir)
	c.Assert(err, IsNil)

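	// MaxRegionSize (15) exceeds the data remaining after the header row, so
	// everything past the header lands in a single region reaching EOF (byte 24).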
	offsets := [][]int64{{14, 24}}

	_, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store)
	c.Assert(err, IsNil)
	c.Assert(regions, HasLen, len(offsets))
	for i := range offsets {
		c.Assert(regions[i].Chunk.Offset, Equals, offsets[i][0])
		c.Assert(regions[i].Chunk.EndOffset, Equals, offsets[i][1])
		c.Assert(regions[i].Chunk.Columns, DeepEquals, columns)
	}
}