github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/loader.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package mydump

import (
	"context"
	"path/filepath"
	"sort"

	"github.com/pingcap/br/pkg/storage"
	"github.com/pingcap/errors"
	filter "github.com/pingcap/tidb-tools/pkg/table-filter"
	router "github.com/pingcap/tidb-tools/pkg/table-router"
	"go.uber.org/zap"

	"github.com/pingcap/tidb-lightning/lightning/config"
	"github.com/pingcap/tidb-lightning/lightning/log"
)

type MDDatabaseMeta struct {
	Name       string
	SchemaFile string
	Tables     []*MDTableMeta
	Views      []*MDTableMeta
	charSet    string
}

type MDTableMeta struct {
	DB         string
	Name       string
	SchemaFile FileInfo
	DataFiles  []FileInfo
	charSet    string
	TotalSize  int64
}

type SourceFileMeta struct {
	Path        string
	Type        SourceType
	Compression Compression
	SortKey     string
	FileSize    int64
}

func (m *MDTableMeta) GetSchema(ctx context.Context, store storage.ExternalStorage) string {
	schema, err := ExportStatement(ctx, store, m.SchemaFile, m.charSet)
	if err != nil {
		log.L().Error("failed to extract table schema",
			zap.String("Path", m.SchemaFile.FileMeta.Path),
			log.ShortError(err),
		)
		return ""
	}
	return string(schema)
}

/*
	Mydumper File Loader
*/
type MDLoader struct {
	store      storage.ExternalStorage
	noSchema   bool
	dbs        []*MDDatabaseMeta
	filter     filter.Filter
	router     *router.Table
	fileRouter FileRouter
	charSet    string
}

type mdLoaderSetup struct {
	loader        *MDLoader
	dbSchemas     []FileInfo
	tableSchemas  []FileInfo
	viewSchemas   []FileInfo
	tableDatas    []FileInfo
	dbIndexMap    map[string]int
	tableIndexMap map[filter.Table]int
}

func NewMyDumpLoader(ctx context.Context, cfg *config.Config) (*MDLoader, error) {
	u, err := storage.ParseBackend(cfg.Mydumper.SourceDir, nil)
	if err != nil {
		return nil, err
	}
	s, err := storage.Create(ctx, u, true)
	if err != nil {
		return nil, err
	}

	return NewMyDumpLoaderWithStore(ctx, cfg, s)
}
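
// The sketch below is an editorial addition, not part of the original file: a
// minimal, hedged example of driving NewMyDumpLoader and walking the metadata
// it produces. It assumes cfg.Mydumper.SourceDir already points at a Mydumper
// output directory; error handling mirrors this package's errors.Trace style.
func exampleLoadDump(ctx context.Context, cfg *config.Config) error {
	loader, err := NewMyDumpLoader(ctx, cfg)
	if err != nil {
		return errors.Trace(err)
	}
	for _, dbMeta := range loader.GetDatabases() {
		log.L().Info("discovered database", zap.String("db", dbMeta.Name))
		for _, tblMeta := range dbMeta.Tables {
			log.L().Info("discovered table",
				zap.String("table", tblMeta.Name),
				zap.Int("dataFiles", len(tblMeta.DataFiles)),
				zap.Int64("totalSize", tblMeta.TotalSize))
		}
	}
	return nil
}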

func NewMyDumpLoaderWithStore(ctx context.Context, cfg *config.Config, store storage.ExternalStorage) (*MDLoader, error) {
	var r *router.Table
	var err error

	if len(cfg.Routes) > 0 && len(cfg.Mydumper.FileRouters) > 0 {
		return nil, errors.New("table route is deprecated, cannot configure both [routes] and [mydumper.files]")
	}

	if len(cfg.Routes) > 0 {
		r, err = router.NewTableRouter(cfg.Mydumper.CaseSensitive, cfg.Routes)
		if err != nil {
			return nil, errors.Trace(err)
		}
	}

	// Use the legacy black-white-list if defined; otherwise use the new filter.
	var f filter.Filter
	if cfg.HasLegacyBlackWhiteList() {
		f, err = filter.ParseMySQLReplicationRules(&cfg.BWList)
	} else {
		f, err = filter.Parse(cfg.Mydumper.Filter)
	}
	if err != nil {
		return nil, errors.Annotate(err, "parse filter failed")
	}
	if !cfg.Mydumper.CaseSensitive {
		f = filter.CaseInsensitive(f)
	}

	fileRouteRules := cfg.Mydumper.FileRouters
	if cfg.Mydumper.DefaultFileRules {
		fileRouteRules = append(fileRouteRules, defaultFileRouteRules...)
	}

	fileRouter, err := NewFileRouter(fileRouteRules)
	if err != nil {
		return nil, errors.Annotate(err, "parse file routing rule failed")
	}

	mdl := &MDLoader{
		store:      store,
		noSchema:   cfg.Mydumper.NoSchema,
		filter:     f,
		router:     r,
		charSet:    cfg.Mydumper.CharacterSet,
		fileRouter: fileRouter,
	}

	setup := mdLoaderSetup{
		loader:        mdl,
		dbIndexMap:    make(map[string]int),
		tableIndexMap: make(map[filter.Table]int),
	}

	if err := setup.setup(ctx, mdl.store); err != nil {
		return nil, errors.Trace(err)
	}

	return mdl, nil
}

type fileType int

const (
	fileTypeDatabaseSchema fileType = iota
	fileTypeTableSchema
	fileTypeTableData
)

func (ftype fileType) String() string {
	switch ftype {
	case fileTypeDatabaseSchema:
		return "database schema"
	case fileTypeTableSchema:
		return "table schema"
	case fileTypeTableData:
		return "table data"
	default:
		return "(unknown)"
	}
}

type FileInfo struct {
	TableName filter.Table
	FileMeta  SourceFileMeta
}
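
// The sketch below is an editorial addition, not part of the original file. It
// demonstrates the table-filter semantics NewMyDumpLoaderWithStore relies on:
// filter.Parse accepts include/exclude patterns (a leading "!" excludes), and
// filter.CaseInsensitive wraps the result the same way the constructor does
// when case-sensitivity is disabled. The patterns are illustrative assumptions.
func exampleFilter() (bool, error) {
	f, err := filter.Parse([]string{"db1.*", "!db1.skip_me"})
	if err != nil {
		return false, errors.Trace(err)
	}
	f = filter.CaseInsensitive(f)
	// Returns true: "DB1"."users" matches "db1.*" case-insensitively and is
	// not excluded by "!db1.skip_me".
	return f.MatchTable("DB1", "users"), nil
}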

// setup fills the `s.loader.dbs` slice by scanning all *.sql files inside the
// data source.
//
// The databases and tables are inserted in a consistent order, so creating an
// MDLoader twice with the same data source is going to produce the same array,
// even after killing Lightning.
//
// This is achieved by using `filepath.Walk` internally, which guarantees the
// files are visited in lexicographical order (note that this does not mean the
// databases and tables in the end are ordered lexicographically, since they may
// be stored in different subdirectories).
//
// Tables are sorted by size, so the largest tables are imported last. This
// prevents a single large table from taking a long time to import while
// blocking small tables from finishing and releasing their index workers.
func (s *mdLoaderSetup) setup(ctx context.Context, store storage.ExternalStorage) error {
	/*
		Mydumper file name formats:
			db    —— {db}-schema-create.sql
			table —— {db}.{table}-schema.sql
			data  —— {db}.{table}.{part}.sql / {db}.{table}.sql
	*/
	if err := s.listFiles(ctx, store); err != nil {
		return errors.Annotate(err, "list file failed")
	}
	if err := s.route(); err != nil {
		return errors.Trace(err)
	}

	if !s.loader.noSchema {
		// setup database schemas
		if len(s.dbSchemas) == 0 {
			return errors.New("no schema create sql files found. Please either set `mydumper.no-schema` to true or add a schema sql file for each database.")
		}
		for _, fileInfo := range s.dbSchemas {
			if _, dbExists := s.insertDB(fileInfo.TableName.Schema, fileInfo.FileMeta.Path); dbExists && s.loader.router == nil {
				return errors.Errorf("invalid database schema file, duplicated item - %s", fileInfo.FileMeta.Path)
			}
		}

		// setup table schemas
		for _, fileInfo := range s.tableSchemas {
			_, dbExists, tableExists := s.insertTable(fileInfo)
			if !dbExists {
				return errors.Errorf("invalid table schema file, cannot find db '%s' - %s", fileInfo.TableName.Schema, fileInfo.FileMeta.Path)
			} else if tableExists && s.loader.router == nil {
				return errors.Errorf("invalid table schema file, duplicated item - %s", fileInfo.FileMeta.Path)
			}
		}

		// setup view schemas
		for _, fileInfo := range s.viewSchemas {
			dbExists, tableExists := s.insertView(fileInfo)
			if !dbExists {
				return errors.Errorf("invalid table schema file, cannot find db '%s' - %s", fileInfo.TableName.Schema, fileInfo.FileMeta.Path)
			} else if !tableExists {
				// the host table schema file path is the view schema file path
				// with the trailing `-view.sql` removed
				return errors.Errorf("invalid view schema file, missing host table schema for view '%s'", fileInfo.TableName.Name)
			}
		}
	}

	// SQL files for restoring data
	for _, fileInfo := range s.tableDatas {
		// pass a dummy `FileInfo` here without file meta because we needn't restore the table schema
		tableMeta, dbExists, tableExists := s.insertTable(FileInfo{TableName: fileInfo.TableName})
		if !s.loader.noSchema {
			if !dbExists {
				return errors.Errorf("invalid data file, missing host db '%s' - %s", fileInfo.TableName.Schema, fileInfo.FileMeta.Path)
			} else if !tableExists {
				return errors.Errorf("invalid data file, missing host table '%s' - %s", fileInfo.TableName.Name, fileInfo.FileMeta.Path)
			}
		}
		tableMeta.DataFiles = append(tableMeta.DataFiles, fileInfo)
		tableMeta.TotalSize += fileInfo.FileMeta.FileSize
	}

	for _, dbMeta := range s.loader.dbs {
		// Put the small tables at the front of the slice, to avoid a large
		// table taking a long time to import while blocking small tables from
		// releasing index workers.
		sort.SliceStable(dbMeta.Tables, func(i, j int) bool {
			return dbMeta.Tables[i].TotalSize < dbMeta.Tables[j].TotalSize
		})

		// sort each table's source files by sort-key
		for _, tbMeta := range dbMeta.Tables {
			dataFiles := tbMeta.DataFiles
			sort.SliceStable(dataFiles, func(i, j int) bool {
				return dataFiles[i].FileMeta.SortKey < dataFiles[j].FileMeta.SortKey
			})
		}
	}

	return nil
}
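
// The sketch below is an editorial addition, not part of the original file. It
// feeds the Mydumper file names documented above through the package's default
// routing rules; with those rules, "test-schema-create.sql" yields a database
// schema, "test.users-schema.sql" a table schema, and "test.users.0001.sql" a
// data file. The concrete file names are illustrative assumptions.
func exampleRouteNames() error {
	r, err := NewFileRouter(defaultFileRouteRules)
	if err != nil {
		return errors.Trace(err)
	}
	for _, name := range []string{
		"test-schema-create.sql",
		"test.users-schema.sql",
		"test.users.0001.sql",
	} {
		res, err := r.Route(name)
		if err != nil {
			return errors.Trace(err)
		}
		log.L().Info("routed file", zap.String("file", name),
			zap.String("schema", res.Schema), zap.String("table", res.Name),
			zap.Stringer("type", res.Type))
	}
	return nil
}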

func (s *mdLoaderSetup) listFiles(ctx context.Context, store storage.ExternalStorage) error {
	// `filepath.Walk` yields the paths in a deterministic (lexicographical) order,
	// meaning the file and chunk orders will be the same every time it is called
	// (as long as the source is immutable).
	err := store.WalkDir(ctx, &storage.WalkOption{}, func(path string, size int64) error {
		logger := log.With(zap.String("path", path))

		res, err := s.loader.fileRouter.Route(filepath.ToSlash(path))
		if err != nil {
			return errors.Annotatef(err, "apply file routing on file '%s' failed", path)
		}
		if res == nil {
			logger.Debug("[loader] file is filtered by file router")
			return nil
		}

		info := FileInfo{
			TableName: filter.Table{Schema: res.Schema, Name: res.Name},
			FileMeta:  SourceFileMeta{Path: path, Type: res.Type, Compression: res.Compression, SortKey: res.Key, FileSize: size},
		}

		if s.loader.shouldSkip(&info.TableName) {
			logger.Debug("[filter] ignoring table file")

			return nil
		}

		switch res.Type {
		case SourceTypeSchemaSchema:
			s.dbSchemas = append(s.dbSchemas, info)
		case SourceTypeTableSchema:
			s.tableSchemas = append(s.tableSchemas, info)
		case SourceTypeViewSchema:
			s.viewSchemas = append(s.viewSchemas, info)
		case SourceTypeSQL, SourceTypeCSV, SourceTypeParquet:
			s.tableDatas = append(s.tableDatas, info)
		}

		logger.Debug("file route result", zap.String("schema", res.Schema),
			zap.String("table", res.Name), zap.Stringer("type", res.Type))

		return nil
	})

	return errors.Trace(err)
}

func (l *MDLoader) shouldSkip(table *filter.Table) bool {
	if len(table.Name) == 0 {
		return !l.filter.MatchSchema(table.Schema)
	}
	return !l.filter.MatchTable(table.Schema, table.Name)
}
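
// The sketch below is an editorial addition, not part of the original file. It
// illustrates the two shouldSkip branches: a filter.Table with an empty Name is
// checked as a schema-level object via MatchSchema, anything else via
// MatchTable. The schema and table names are illustrative assumptions.
func exampleShouldSkip(l *MDLoader) (bool, bool) {
	dbOnly := filter.Table{Schema: "mysql"}                 // schema-create file, no table name
	dataFile := filter.Table{Schema: "test", Name: "users"} // table schema or data file
	return l.shouldSkip(&dbOnly), l.shouldSkip(&dataFile)
}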

func (s *mdLoaderSetup) route() error {
	r := s.loader.router
	if r == nil {
		return nil
	}

	type dbInfo struct {
		fileMeta SourceFileMeta
		count    int
	}

	knownDBNames := make(map[string]dbInfo)
	for _, info := range s.dbSchemas {
		knownDBNames[info.TableName.Schema] = dbInfo{
			fileMeta: info.FileMeta,
			count:    1,
		}
	}
	for _, info := range s.tableSchemas {
		dbInfo := knownDBNames[info.TableName.Schema]
		dbInfo.count++
		knownDBNames[info.TableName.Schema] = dbInfo
	}
	for _, info := range s.viewSchemas {
		dbInfo := knownDBNames[info.TableName.Schema]
		dbInfo.count++
		// write the incremented count back; the original omitted this store,
		// so view schemas never contributed to the reference count
		knownDBNames[info.TableName.Schema] = dbInfo
	}

	run := func(arr []FileInfo) error {
		for i, info := range arr {
			dbName, tableName, err := r.Route(info.TableName.Schema, info.TableName.Name)
			if err != nil {
				return errors.Trace(err)
			}
			if dbName != info.TableName.Schema {
				oldInfo := knownDBNames[info.TableName.Schema]
				oldInfo.count--
				knownDBNames[info.TableName.Schema] = oldInfo

				newInfo, ok := knownDBNames[dbName]
				newInfo.count++
				if !ok {
					newInfo.fileMeta = oldInfo.fileMeta
					s.dbSchemas = append(s.dbSchemas, FileInfo{
						TableName: filter.Table{Schema: dbName},
						FileMeta:  oldInfo.fileMeta,
					})
				}
				knownDBNames[dbName] = newInfo
			}
			arr[i].TableName = filter.Table{Schema: dbName, Name: tableName}
		}
		return nil
	}

	if err := run(s.tableSchemas); err != nil {
		return errors.Trace(err)
	}
	if err := run(s.viewSchemas); err != nil {
		return errors.Trace(err)
	}
	if err := run(s.tableDatas); err != nil {
		return errors.Trace(err)
	}

	// remove all schemas which have been entirely routed away
	// https://github.com/golang/go/wiki/SliceTricks#filtering-without-allocating
	remainingSchemas := s.dbSchemas[:0]
	for _, info := range s.dbSchemas {
		if knownDBNames[info.TableName.Schema].count > 0 {
			remainingSchemas = append(remainingSchemas, info)
		}
	}
	s.dbSchemas = remainingSchemas

	return nil
}

func (s *mdLoaderSetup) insertDB(dbName string, path string) (*MDDatabaseMeta, bool) {
	dbIndex, ok := s.dbIndexMap[dbName]
	if ok {
		return s.loader.dbs[dbIndex], true
	} else {
		s.dbIndexMap[dbName] = len(s.loader.dbs)
		ptr := &MDDatabaseMeta{
			Name:       dbName,
			SchemaFile: path,
			charSet:    s.loader.charSet,
		}
		s.loader.dbs = append(s.loader.dbs, ptr)
		return ptr, false
	}
}

func (s *mdLoaderSetup) insertTable(fileInfo FileInfo) (*MDTableMeta, bool, bool) {
	dbMeta, dbExists := s.insertDB(fileInfo.TableName.Schema, "")
	tableIndex, ok := s.tableIndexMap[fileInfo.TableName]
	if ok {
		return dbMeta.Tables[tableIndex], dbExists, true
	} else {
		s.tableIndexMap[fileInfo.TableName] = len(dbMeta.Tables)
		ptr := &MDTableMeta{
			DB:         fileInfo.TableName.Schema,
			Name:       fileInfo.TableName.Name,
			SchemaFile: fileInfo,
			DataFiles:  make([]FileInfo, 0, 16),
			charSet:    s.loader.charSet,
		}
		dbMeta.Tables = append(dbMeta.Tables, ptr)
		return ptr, dbExists, false
	}
}

func (s *mdLoaderSetup) insertView(fileInfo FileInfo) (bool, bool) {
	dbMeta, dbExists := s.insertDB(fileInfo.TableName.Schema, "")
	_, ok := s.tableIndexMap[fileInfo.TableName]
	if ok {
		meta := &MDTableMeta{
			DB:         fileInfo.TableName.Schema,
			Name:       fileInfo.TableName.Name,
			SchemaFile: fileInfo,
			charSet:    s.loader.charSet,
		}
		dbMeta.Views = append(dbMeta.Views, meta)
	}
	return dbExists, ok
}

func (l *MDLoader) GetDatabases() []*MDDatabaseMeta {
	return l.dbs
}

func (l *MDLoader) GetStore() storage.ExternalStorage {
	return l.store
}
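
// The sketch below is an editorial addition, not part of the original file. It
// restates the SliceTricks "filtering without allocating" idiom that route()
// uses on s.dbSchemas: kept elements are compacted into the slice's existing
// backing array, so no new allocation occurs.
func exampleFilterInPlace(files []FileInfo, keep func(FileInfo) bool) []FileInfo {
	out := files[:0] // shares the backing array with files
	for _, f := range files {
		if keep(f) {
			out = append(out, f)
		}
	}
	return out
}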