github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/loader.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package mydump

import (
	"context"
	"path/filepath"
	"sort"

	"github.com/pingcap/errors"
	filter "github.com/pingcap/tidb-tools/pkg/table-filter"
	router "github.com/pingcap/tidb-tools/pkg/table-router"
	"go.uber.org/zap"

	"github.com/pingcap/br/pkg/lightning/config"
	"github.com/pingcap/br/pkg/lightning/log"
	"github.com/pingcap/br/pkg/storage"
)

// MDDatabaseMeta is the metadata of a database parsed from the data source.
type MDDatabaseMeta struct {
	Name       string
	SchemaFile string
	Tables     []*MDTableMeta
	Views      []*MDTableMeta
	charSet    string
}

// MDTableMeta is the metadata of a table (or view) parsed from the data source.
type MDTableMeta struct {
	DB           string
	Name         string
	SchemaFile   FileInfo
	DataFiles    []FileInfo
	charSet      string
	TotalSize    int64
	IndexRatio   float64
	IsRowOrdered bool
}

// SourceFileMeta describes a single file in the data source.
type SourceFileMeta struct {
	Path        string
	Type        SourceType
	Compression Compression
	SortKey     string
	FileSize    int64
}

// GetSchema reads the table's CREATE statement from its schema file.
func (m *MDTableMeta) GetSchema(ctx context.Context, store storage.ExternalStorage) (string, error) {
	schema, err := ExportStatement(ctx, store, m.SchemaFile, m.charSet)
	if err != nil {
		log.L().Error("failed to extract table schema",
			zap.String("Path", m.SchemaFile.FileMeta.Path),
			log.ShortError(err),
		)
		return "", err
	}
	return string(schema), nil
}

/*
	Mydumper File Loader
*/
type MDLoader struct {
	store      storage.ExternalStorage
	dbs        []*MDDatabaseMeta
	filter     filter.Filter
	router     *router.Table
	fileRouter FileRouter
	charSet    string
}

type mdLoaderSetup struct {
	loader        *MDLoader
	dbSchemas     []FileInfo
	tableSchemas  []FileInfo
	viewSchemas   []FileInfo
	tableDatas    []FileInfo
	dbIndexMap    map[string]int
	tableIndexMap map[filter.Table]int
}

// NewMyDumpLoader constructs an MDLoader for the source directory given in the
// configuration.
func NewMyDumpLoader(ctx context.Context, cfg *config.Config) (*MDLoader, error) {
	u, err := storage.ParseBackend(cfg.Mydumper.SourceDir, nil)
	if err != nil {
		return nil, errors.Trace(err)
	}
	s, err := storage.New(ctx, u, &storage.ExternalStorageOptions{})
	if err != nil {
		return nil, errors.Trace(err)
	}

	return NewMyDumpLoaderWithStore(ctx, cfg, s)
}
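// A minimal usage sketch (hypothetical configuration values; error handling
// mostly elided), not taken from this package's tests:
//
//	cfg := config.NewConfig()
//	cfg.Mydumper.SourceDir = "/data/dump" // assumed local dump directory
//	cfg.Mydumper.DefaultFileRules = true
//	loader, err := NewMyDumpLoader(context.Background(), cfg)
//	if err != nil {
//		return err
//	}
//	for _, db := range loader.GetDatabases() {
//		fmt.Println(db.Name, len(db.Tables))
//	}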
// NewMyDumpLoaderWithStore constructs an MDLoader that reads from the given
// external storage.
func NewMyDumpLoaderWithStore(ctx context.Context, cfg *config.Config, store storage.ExternalStorage) (*MDLoader, error) {
	var r *router.Table
	var err error

	if len(cfg.Routes) > 0 && len(cfg.Mydumper.FileRouters) > 0 {
		return nil, errors.New("table route is deprecated, can't config both [routes] and [mydumper.files]")
	}

	if len(cfg.Routes) > 0 {
		r, err = router.NewTableRouter(cfg.Mydumper.CaseSensitive, cfg.Routes)
		if err != nil {
			return nil, errors.Trace(err)
		}
	}

	// Use the legacy black-white-list if defined; otherwise use the new filter.
	var f filter.Filter
	if cfg.HasLegacyBlackWhiteList() {
		f, err = filter.ParseMySQLReplicationRules(&cfg.BWList)
	} else {
		f, err = filter.Parse(cfg.Mydumper.Filter)
	}
	if err != nil {
		return nil, errors.Annotate(err, "parse filter failed")
	}
	if !cfg.Mydumper.CaseSensitive {
		f = filter.CaseInsensitive(f)
	}

	fileRouteRules := cfg.Mydumper.FileRouters
	if cfg.Mydumper.DefaultFileRules {
		fileRouteRules = append(fileRouteRules, defaultFileRouteRules...)
	}

	fileRouter, err := NewFileRouter(fileRouteRules)
	if err != nil {
		return nil, errors.Annotate(err, "parse file routing rule failed")
	}

	mdl := &MDLoader{
		store:      store,
		filter:     f,
		router:     r,
		charSet:    cfg.Mydumper.CharacterSet,
		fileRouter: fileRouter,
	}

	setup := mdLoaderSetup{
		loader:        mdl,
		dbIndexMap:    make(map[string]int),
		tableIndexMap: make(map[filter.Table]int),
	}

	if err := setup.setup(ctx, mdl.store); err != nil {
		return nil, errors.Trace(err)
	}

	return mdl, nil
}

type fileType int

const (
	fileTypeDatabaseSchema fileType = iota
	fileTypeTableSchema
	fileTypeTableData
)

func (ftype fileType) String() string {
	switch ftype {
	case fileTypeDatabaseSchema:
		return "database schema"
	case fileTypeTableSchema:
		return "table schema"
	case fileTypeTableData:
		return "table data"
	default:
		return "(unknown)"
	}
}

// FileInfo is the routing result for a single source file: the table it
// belongs to and the file's metadata.
type FileInfo struct {
	TableName filter.Table
	FileMeta  SourceFileMeta
}
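// For illustration only: under the default file routing rules, a data file
// named "test.t1.000000001.sql" would be routed to roughly the following
// FileInfo (the exact SortKey depends on the configured rules, and FileSize is
// whatever the storage walk reports):
//
//	FileInfo{
//		TableName: filter.Table{Schema: "test", Name: "t1"},
//		FileMeta: SourceFileMeta{
//			Path:     "test.t1.000000001.sql",
//			Type:     SourceTypeSQL,
//			SortKey:  "000000001",
//			FileSize: 1024,
//		},
//	}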
// setup populates the `s.loader.dbs` slice by scanning all *.sql files inside `dir`.
//
// The databases and tables are inserted in a consistent order, so creating an
// MDLoader twice with the same data source produces the same array, even after
// killing Lightning.
//
// This is achieved by using `filepath.Walk` internally, which guarantees that
// the files are visited in lexicographical order (note that this does not mean
// the resulting databases and tables are ordered lexicographically, since they
// may be stored in different subdirectories).
//
// Tables are also sorted by size, so the largest tables are imported last.
// This prevents a large table from taking a long time to import while blocking
// small tables from releasing the index workers.
func (s *mdLoaderSetup) setup(ctx context.Context, store storage.ExternalStorage) error {
	/*
		Mydumper file name formats:
			db schema    —— {db}-schema-create.sql
			table schema —— {db}.{table}-schema.sql
			data         —— {db}.{table}.{part}.sql / {db}.{table}.sql
	*/
	if err := s.listFiles(ctx, store); err != nil {
		return errors.Annotate(err, "list file failed")
	}
	if err := s.route(); err != nil {
		return errors.Trace(err)
	}

	// setup database schemas
	if len(s.dbSchemas) != 0 {
		for _, fileInfo := range s.dbSchemas {
			if _, dbExists := s.insertDB(fileInfo.TableName.Schema, fileInfo.FileMeta.Path); dbExists && s.loader.router == nil {
				return errors.Errorf("invalid database schema file, duplicated item - %s", fileInfo.FileMeta.Path)
			}
		}
	}

	if len(s.tableSchemas) != 0 {
		// setup table schemas
		for _, fileInfo := range s.tableSchemas {
			if _, _, tableExists := s.insertTable(fileInfo); tableExists && s.loader.router == nil {
				return errors.Errorf("invalid table schema file, duplicated item - %s", fileInfo.FileMeta.Path)
			}
		}
	}

	if len(s.viewSchemas) != 0 {
		// setup view schemas
		for _, fileInfo := range s.viewSchemas {
			_, tableExists := s.insertView(fileInfo)
			if !tableExists {
				// When dumpling exports a view it also emits the schema of the
				// host table, so a view schema without the corresponding table
				// schema is unexpected. (Removing the trailing `-view.sql` from
				// the path gives the related table schema file path.)
				return errors.Errorf("invalid view schema file, missing host table schema for view '%s'", fileInfo.TableName.Name)
			}
		}
	}

	// SQL files holding the data to restore
	for _, fileInfo := range s.tableDatas {
		// set a dummy `FileInfo` here without file meta because we don't need
		// to restore the table schema
		tableMeta, _, _ := s.insertTable(FileInfo{TableName: fileInfo.TableName})
		tableMeta.DataFiles = append(tableMeta.DataFiles, fileInfo)
		tableMeta.TotalSize += fileInfo.FileMeta.FileSize
	}

	for _, dbMeta := range s.loader.dbs {
		// Put the small tables at the front of the slice, to avoid a large
		// table taking a long time to import and blocking small tables from
		// releasing the index workers.
		meta := dbMeta
		sort.SliceStable(meta.Tables, func(i, j int) bool {
			return meta.Tables[i].TotalSize < meta.Tables[j].TotalSize
		})

		// sort each table's source files by their sort key
		for _, tbMeta := range meta.Tables {
			dataFiles := tbMeta.DataFiles
			sort.SliceStable(dataFiles, func(i, j int) bool {
				return dataFiles[i].FileMeta.SortKey < dataFiles[j].FileMeta.SortKey
			})
		}
	}

	return nil
}
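// A sanity-check sketch of the resulting order (hypothetical sizes):
//
//	// Given tables with TotalSize 1<<30, 1<<10 and 1<<20 bytes, setup leaves
//	// dbMeta.Tables ordered as [1<<10, 1<<20, 1<<30], i.e. smallest first.
//	// Data files within each table follow SortKey order, e.g. a dump split
//	// into parts "000000001" < "000000002" is replayed in part order.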
func (s *mdLoaderSetup) listFiles(ctx context.Context, store storage.ExternalStorage) error {
	// `filepath.Walk` yields the paths in a deterministic (lexicographical)
	// order, meaning the file and chunk orders will be the same every time it
	// is called (as long as the source is immutable).
	err := store.WalkDir(ctx, &storage.WalkOption{}, func(path string, size int64) error {
		logger := log.With(zap.String("path", path))

		res, err := s.loader.fileRouter.Route(filepath.ToSlash(path))
		if err != nil {
			return errors.Annotatef(err, "apply file routing on file '%s' failed", path)
		}
		if res == nil {
			logger.Info("[loader] file is filtered by file router")
			return nil
		}

		info := FileInfo{
			TableName: filter.Table{Schema: res.Schema, Name: res.Name},
			FileMeta:  SourceFileMeta{Path: path, Type: res.Type, Compression: res.Compression, SortKey: res.Key, FileSize: size},
		}

		if s.loader.shouldSkip(&info.TableName) {
			logger.Debug("[filter] ignoring table file")

			return nil
		}

		switch res.Type {
		case SourceTypeSchemaSchema:
			s.dbSchemas = append(s.dbSchemas, info)
		case SourceTypeTableSchema:
			s.tableSchemas = append(s.tableSchemas, info)
		case SourceTypeViewSchema:
			s.viewSchemas = append(s.viewSchemas, info)
		case SourceTypeSQL, SourceTypeCSV, SourceTypeParquet:
			s.tableDatas = append(s.tableDatas, info)
		}

		logger.Debug("file route result", zap.String("schema", res.Schema),
			zap.String("table", res.Name), zap.Stringer("type", res.Type))

		return nil
	})

	return errors.Trace(err)
}

func (l *MDLoader) shouldSkip(table *filter.Table) bool {
	if len(table.Name) == 0 {
		return !l.filter.MatchSchema(table.Schema)
	}
	return !l.filter.MatchTable(table.Schema, table.Name)
}

func (s *mdLoaderSetup) route() error {
	r := s.loader.router
	if r == nil {
		return nil
	}

	type dbInfo struct {
		fileMeta SourceFileMeta
		count    int
	}

	knownDBNames := make(map[string]dbInfo)
	for _, info := range s.dbSchemas {
		knownDBNames[info.TableName.Schema] = dbInfo{
			fileMeta: info.FileMeta,
			count:    1,
		}
	}
	for _, info := range s.tableSchemas {
		dbInfo := knownDBNames[info.TableName.Schema]
		dbInfo.count++
		knownDBNames[info.TableName.Schema] = dbInfo
	}
	for _, info := range s.viewSchemas {
		// view schemas count toward the schema's reference count as well
		dbInfo := knownDBNames[info.TableName.Schema]
		dbInfo.count++
		knownDBNames[info.TableName.Schema] = dbInfo
	}

	run := func(arr []FileInfo) error {
		for i, info := range arr {
			dbName, tableName, err := r.Route(info.TableName.Schema, info.TableName.Name)
			if err != nil {
				return errors.Trace(err)
			}
			if dbName != info.TableName.Schema {
				oldInfo := knownDBNames[info.TableName.Schema]
				oldInfo.count--
				knownDBNames[info.TableName.Schema] = oldInfo

				newInfo, ok := knownDBNames[dbName]
				newInfo.count++
				if !ok {
					newInfo.fileMeta = oldInfo.fileMeta
					s.dbSchemas = append(s.dbSchemas, FileInfo{
						TableName: filter.Table{Schema: dbName},
						FileMeta:  oldInfo.fileMeta,
					})
				}
				knownDBNames[dbName] = newInfo
			}
			arr[i].TableName = filter.Table{Schema: dbName, Name: tableName}
		}
		return nil
	}

	if err := run(s.tableSchemas); err != nil {
		return errors.Trace(err)
	}
	if err := run(s.viewSchemas); err != nil {
		return errors.Trace(err)
	}
	if err := run(s.tableDatas); err != nil {
		return errors.Trace(err)
	}

	// remove all schemas which have been entirely routed away
	// https://github.com/golang/go/wiki/SliceTricks#filtering-without-allocating
	remainingSchemas := s.dbSchemas[:0]
	for _, info := range s.dbSchemas {
		if knownDBNames[info.TableName.Schema].count > 0 {
			remainingSchemas = append(remainingSchemas, info)
		}
	}
	s.dbSchemas = remainingSchemas

	return nil
}
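// Routing sketch (hypothetical rule, using the tidb-tools table-router API):
// a rule merging sharded schemas re-labels every file's TableName, after which
// the now-empty shard schemas are filtered out of s.dbSchemas above.
//
//	r, _ := router.NewTableRouter(false, []*router.TableRule{{
//		SchemaPattern: "shard_*",
//		TargetSchema:  "merged",
//	}})
//	db, tbl, _ := r.Route("shard_0", "t") // db == "merged", tbl == "t"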
func (s *mdLoaderSetup) insertDB(dbName string, path string) (*MDDatabaseMeta, bool) {
	dbIndex, ok := s.dbIndexMap[dbName]
	if ok {
		return s.loader.dbs[dbIndex], true
	}
	s.dbIndexMap[dbName] = len(s.loader.dbs)
	ptr := &MDDatabaseMeta{
		Name:       dbName,
		SchemaFile: path,
		charSet:    s.loader.charSet,
	}
	s.loader.dbs = append(s.loader.dbs, ptr)
	return ptr, false
}

func (s *mdLoaderSetup) insertTable(fileInfo FileInfo) (*MDTableMeta, bool, bool) {
	dbMeta, dbExists := s.insertDB(fileInfo.TableName.Schema, "")
	tableIndex, ok := s.tableIndexMap[fileInfo.TableName]
	if ok {
		return dbMeta.Tables[tableIndex], dbExists, true
	}
	s.tableIndexMap[fileInfo.TableName] = len(dbMeta.Tables)
	ptr := &MDTableMeta{
		DB:           fileInfo.TableName.Schema,
		Name:         fileInfo.TableName.Name,
		SchemaFile:   fileInfo,
		DataFiles:    make([]FileInfo, 0, 16),
		charSet:      s.loader.charSet,
		IndexRatio:   0.0,
		IsRowOrdered: true,
	}
	dbMeta.Tables = append(dbMeta.Tables, ptr)
	return ptr, dbExists, false
}

func (s *mdLoaderSetup) insertView(fileInfo FileInfo) (bool, bool) {
	dbMeta, dbExists := s.insertDB(fileInfo.TableName.Schema, "")
	_, ok := s.tableIndexMap[fileInfo.TableName]
	if ok {
		meta := &MDTableMeta{
			DB:           fileInfo.TableName.Schema,
			Name:         fileInfo.TableName.Name,
			SchemaFile:   fileInfo,
			charSet:      s.loader.charSet,
			IndexRatio:   0.0,
			IsRowOrdered: true,
		}
		dbMeta.Views = append(dbMeta.Views, meta)
	}
	return dbExists, ok
}

// GetDatabases returns the database metadata collected by the loader.
func (l *MDLoader) GetDatabases() []*MDDatabaseMeta {
	return l.dbs
}

// GetStore returns the external storage the loader reads from.
func (l *MDLoader) GetStore() storage.ExternalStorage {
	return l.store
}
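// Sketch: reading one table's CREATE statement after loading (hypothetical
// variable names; error handling elided):
//
//	dbs := loader.GetDatabases()
//	tbl := dbs[0].Tables[0]
//	ddl, _ := tbl.GetSchema(ctx, loader.GetStore())
//	// ddl now holds the CREATE TABLE statement decoded with the configured
//	// character set.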