github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/loader.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package mydump

import (
	"context"
	"path/filepath"
	"sort"

	"github.com/pingcap/br/pkg/storage"
	"github.com/pingcap/errors"
	filter "github.com/pingcap/tidb-tools/pkg/table-filter"
	router "github.com/pingcap/tidb-tools/pkg/table-router"
	"go.uber.org/zap"

	"github.com/pingcap/tidb-lightning/lightning/config"
	"github.com/pingcap/tidb-lightning/lightning/log"
)

// MDDatabaseMeta is the metadata of a database discovered in the data source:
// its schema file and the tables and views it contains.
type MDDatabaseMeta struct {
	Name       string
	SchemaFile string
	Tables     []*MDTableMeta
	Views      []*MDTableMeta
	charSet    string
}

// MDTableMeta is the metadata of a table discovered in the data source:
// its schema file and the data files to be imported.
type MDTableMeta struct {
	DB         string
	Name       string
	SchemaFile FileInfo
	DataFiles  []FileInfo
	charSet    string
	TotalSize  int64
}

// SourceFileMeta describes a single source file: its path, routed type,
// compression, sort key and size in bytes.
type SourceFileMeta struct {
	Path        string
	Type        SourceType
	Compression Compression
	SortKey     string
	FileSize    int64
}

// GetSchema reads the table's schema file from the external storage and
// returns the CREATE statement as a string. It returns "" when the file
// cannot be read or decoded.
func (m *MDTableMeta) GetSchema(ctx context.Context, store storage.ExternalStorage) string {
	schema, err := ExportStatement(ctx, store, m.SchemaFile, m.charSet)
	if err != nil {
		log.L().Error("failed to extract table schema",
			zap.String("Path", m.SchemaFile.FileMeta.Path),
			log.ShortError(err),
		)
		return ""
	}
	return string(schema)
}

// MDLoader is the Mydumper file loader. It scans the data source, applies
// table filters and routing rules, and builds the list of databases and
// tables to import.
type MDLoader struct {
	store      storage.ExternalStorage
	noSchema   bool
	dbs        []*MDDatabaseMeta
	filter     filter.Filter
	router     *router.Table
	fileRouter FileRouter
	charSet    string
}

type mdLoaderSetup struct {
	loader        *MDLoader
	dbSchemas     []FileInfo
	tableSchemas  []FileInfo
	viewSchemas   []FileInfo
	tableDatas    []FileInfo
	dbIndexMap    map[string]int
	tableIndexMap map[filter.Table]int
}

// NewMyDumpLoader creates an MDLoader over the external storage referred to
// by cfg.Mydumper.SourceDir.
func NewMyDumpLoader(ctx context.Context, cfg *config.Config) (*MDLoader, error) {
	u, err := storage.ParseBackend(cfg.Mydumper.SourceDir, nil)
	if err != nil {
		return nil, err
	}
	s, err := storage.Create(ctx, u, true)
	if err != nil {
		return nil, err
	}

	return NewMyDumpLoaderWithStore(ctx, cfg, s)
}
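
// A minimal usage sketch (illustrative only, not code from this repository):
// build a loader from a config and enumerate what it discovered. The source
// directory, the surrounding ctx, and the error handling are assumptions.
//
//	cfg := config.NewConfig()
//	cfg.Mydumper.SourceDir = "/data/export" // hypothetical dump directory
//	cfg.Mydumper.DefaultFileRules = true
//	loader, err := NewMyDumpLoader(ctx, cfg)
//	if err != nil {
//		return err
//	}
//	for _, db := range loader.GetDatabases() {
//		for _, tbl := range db.Tables {
//			fmt.Println(db.Name, tbl.Name, tbl.TotalSize)
//		}
//	}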

// NewMyDumpLoaderWithStore creates an MDLoader over the given external
// storage.
func NewMyDumpLoaderWithStore(ctx context.Context, cfg *config.Config, store storage.ExternalStorage) (*MDLoader, error) {
	var r *router.Table
	var err error

	if len(cfg.Routes) > 0 && len(cfg.Mydumper.FileRouters) > 0 {
		return nil, errors.New("table route is deprecated, can't config both [routes] and [mydumper.files]")
	}

	if len(cfg.Routes) > 0 {
		r, err = router.NewTableRouter(cfg.Mydumper.CaseSensitive, cfg.Routes)
		if err != nil {
			return nil, errors.Trace(err)
		}
	}

	// Use the legacy black-white-list if defined; otherwise use the new filter.
	var f filter.Filter
	if cfg.HasLegacyBlackWhiteList() {
		f, err = filter.ParseMySQLReplicationRules(&cfg.BWList)
	} else {
		f, err = filter.Parse(cfg.Mydumper.Filter)
	}
	if err != nil {
		return nil, errors.Annotate(err, "parse filter failed")
	}
	if !cfg.Mydumper.CaseSensitive {
		f = filter.CaseInsensitive(f)
	}

	fileRouteRules := cfg.Mydumper.FileRouters
	if cfg.Mydumper.DefaultFileRules {
		fileRouteRules = append(fileRouteRules, defaultFileRouteRules...)
	}

	fileRouter, err := NewFileRouter(fileRouteRules)
	if err != nil {
		return nil, errors.Annotate(err, "parse file routing rule failed")
	}

	mdl := &MDLoader{
		store:      store,
		noSchema:   cfg.Mydumper.NoSchema,
		filter:     f,
		router:     r,
		charSet:    cfg.Mydumper.CharacterSet,
		fileRouter: fileRouter,
	}

	setup := mdLoaderSetup{
		loader:        mdl,
		dbIndexMap:    make(map[string]int),
		tableIndexMap: make(map[filter.Table]int),
	}

	if err := setup.setup(ctx, mdl.store); err != nil {
		return nil, errors.Trace(err)
	}

	return mdl, nil
}

type fileType int

const (
	fileTypeDatabaseSchema fileType = iota
	fileTypeTableSchema
	fileTypeTableData
)

func (ftype fileType) String() string {
	switch ftype {
	case fileTypeDatabaseSchema:
		return "database schema"
	case fileTypeTableSchema:
		return "table schema"
	case fileTypeTableData:
		return "table data"
	default:
		return "(unknown)"
	}
}

// FileInfo is a source file together with the table it has been routed to.
type FileInfo struct {
	TableName filter.Table
	FileMeta  SourceFileMeta
}

// setup populates the `s.loader.dbs` slice by scanning all source files
// inside the given store.
//
// The databases and tables are inserted in a consistent order, so creating an
// MDLoader twice with the same data source produces the same array, even
// after killing Lightning.
//
// This is achieved by using `filepath.Walk` internally, which guarantees that
// the files are visited in lexicographical order (note that this does not
// mean the resulting databases and tables are ordered lexicographically,
// since they may be stored in different subdirectories).
//
// Tables are sorted by size so that the largest tables are imported last;
// this prevents a large table from taking a long time to import while
// blocking small tables from releasing the index worker.
func (s *mdLoaderSetup) setup(ctx context.Context, store storage.ExternalStorage) error {
	/*
		Mydumper file name formats:
			db    —— {db}-schema-create.sql
			table —— {db}.{table}-schema.sql
			sql   —— {db}.{table}.{part}.sql / {db}.{table}.sql
	*/
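	// For example, a dump of database `db1` containing table `tbl` might look
	// like this (file names are illustrative):
	//   db1-schema-create.sql   -- CREATE DATABASE `db1`
	//   db1.tbl-schema.sql      -- CREATE TABLE `tbl`
	//   db1.tbl.0001.sql        -- INSERT INTO `tbl` ...
	//   db1.tbl.0002.sql        -- INSERT INTO `tbl` ...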
	if err := s.listFiles(ctx, store); err != nil {
		return errors.Annotate(err, "list file failed")
	}
	if err := s.route(); err != nil {
		return errors.Trace(err)
	}

	if !s.loader.noSchema {
		// setup database schemas
		if len(s.dbSchemas) == 0 {
			return errors.New("no schema create sql files found. Please either set `mydumper.no-schema` to true or add a schema sql file for each database.")
		}
		for _, fileInfo := range s.dbSchemas {
			if _, dbExists := s.insertDB(fileInfo.TableName.Schema, fileInfo.FileMeta.Path); dbExists && s.loader.router == nil {
				return errors.Errorf("invalid database schema file, duplicated item - %s", fileInfo.FileMeta.Path)
			}
		}

		// setup table schemas
		for _, fileInfo := range s.tableSchemas {
			_, dbExists, tableExists := s.insertTable(fileInfo)
			if !dbExists {
				return errors.Errorf("invalid table schema file, cannot find db '%s' - %s", fileInfo.TableName.Schema, fileInfo.FileMeta.Path)
			} else if tableExists && s.loader.router == nil {
				return errors.Errorf("invalid table schema file, duplicated item - %s", fileInfo.FileMeta.Path)
			}
		}

		// setup view schemas
		for _, fileInfo := range s.viewSchemas {
			dbExists, tableExists := s.insertView(fileInfo)
			if !dbExists {
				return errors.Errorf("invalid view schema file, cannot find db '%s' - %s", fileInfo.TableName.Schema, fileInfo.FileMeta.Path)
			} else if !tableExists {
				// a view's schema file must be paired with a table schema file
				// of the same name (the view file path ends with `-view.sql`)
				return errors.Errorf("invalid view schema file, missing host table schema for view '%s'", fileInfo.TableName.Name)
			}
		}
	}

	// SQL files containing the data to restore
	for _, fileInfo := range s.tableDatas {
		// set a dummy `FileInfo` here without file meta because we needn't restore the table schema
		tableMeta, dbExists, tableExists := s.insertTable(FileInfo{TableName: fileInfo.TableName})
		if !s.loader.noSchema {
			if !dbExists {
				return errors.Errorf("invalid data file, missing host db '%s' - %s", fileInfo.TableName.Schema, fileInfo.FileMeta.Path)
			} else if !tableExists {
				return errors.Errorf("invalid data file, missing host table '%s' - %s", fileInfo.TableName.Name, fileInfo.FileMeta.Path)
			}
		}
		tableMeta.DataFiles = append(tableMeta.DataFiles, fileInfo)
		tableMeta.TotalSize += fileInfo.FileMeta.FileSize
	}

	for _, dbMeta := range s.loader.dbs {
		// Put the small tables at the front of the slice, to avoid a large
		// table taking a long time to import and blocking small tables from
		// releasing the index worker.
		sort.SliceStable(dbMeta.Tables, func(i, j int) bool {
			return dbMeta.Tables[i].TotalSize < dbMeta.Tables[j].TotalSize
		})

		// sort each table's source files by their sort key
		for _, tbMeta := range dbMeta.Tables {
			dataFiles := tbMeta.DataFiles
			sort.SliceStable(dataFiles, func(i, j int) bool {
				return dataFiles[i].FileMeta.SortKey < dataFiles[j].FileMeta.SortKey
			})
		}
	}

	return nil
}

func (s *mdLoaderSetup) listFiles(ctx context.Context, store storage.ExternalStorage) error {
	// `filepath.Walk` yields the paths in a deterministic (lexicographical) order,
	// meaning the file and chunk orders will be the same every time it is called
	// (as long as the source is immutable).
	err := store.WalkDir(ctx, &storage.WalkOption{}, func(path string, size int64) error {
		logger := log.With(zap.String("path", path))

		res, err := s.loader.fileRouter.Route(filepath.ToSlash(path))
		if err != nil {
			return errors.Annotatef(err, "apply file routing on file '%s' failed", path)
		}
		if res == nil {
			logger.Debug("[loader] file is filtered by file router")
			return nil
		}

		info := FileInfo{
			TableName: filter.Table{Schema: res.Schema, Name: res.Name},
			FileMeta:  SourceFileMeta{Path: path, Type: res.Type, Compression: res.Compression, SortKey: res.Key, FileSize: size},
		}

		if s.loader.shouldSkip(&info.TableName) {
			logger.Debug("[filter] ignoring table file")

			return nil
		}

		switch res.Type {
		case SourceTypeSchemaSchema:
			s.dbSchemas = append(s.dbSchemas, info)
		case SourceTypeTableSchema:
			s.tableSchemas = append(s.tableSchemas, info)
		case SourceTypeViewSchema:
			s.viewSchemas = append(s.viewSchemas, info)
		case SourceTypeSQL, SourceTypeCSV, SourceTypeParquet:
			s.tableDatas = append(s.tableDatas, info)
		}

		logger.Debug("file route result", zap.String("schema", res.Schema),
			zap.String("table", res.Name), zap.Stringer("type", res.Type))

		return nil
	})

	return errors.Trace(err)
}

// shouldSkip reports whether a schema or table is excluded by the configured
// table filter. An empty table name means the argument refers to a schema.
func (l *MDLoader) shouldSkip(table *filter.Table) bool {
	if len(table.Name) == 0 {
		return !l.filter.MatchSchema(table.Schema)
	}
	return !l.filter.MatchTable(table.Schema, table.Name)
}
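
// For instance, with the filter rules ["db1.*"] (a hypothetical
// configuration), shouldSkip keeps db1 and every table inside it, and skips
// all other schemas and tables:
//
//	f, _ := filter.Parse([]string{"db1.*"})
//	l := &MDLoader{filter: f}
//	l.shouldSkip(&filter.Table{Schema: "db1", Name: "t"}) // false: kept
//	l.shouldSkip(&filter.Table{Schema: "db2", Name: "t"}) // true: skipped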

// route applies the table-routing rules to every discovered schema, table and
// data file, then drops database schemas that have been entirely routed away.
func (s *mdLoaderSetup) route() error {
	r := s.loader.router
	if r == nil {
		return nil
	}

	type dbInfo struct {
		fileMeta SourceFileMeta
		count    int
	}

	knownDBNames := make(map[string]dbInfo)
	for _, info := range s.dbSchemas {
		knownDBNames[info.TableName.Schema] = dbInfo{
			fileMeta: info.FileMeta,
			count:    1,
		}
	}
	for _, info := range s.tableSchemas {
		dbInfo := knownDBNames[info.TableName.Schema]
		dbInfo.count++
		knownDBNames[info.TableName.Schema] = dbInfo
	}
	for _, info := range s.viewSchemas {
		dbInfo := knownDBNames[info.TableName.Schema]
		dbInfo.count++
		knownDBNames[info.TableName.Schema] = dbInfo
	}

	run := func(arr []FileInfo) error {
		for i, info := range arr {
			dbName, tableName, err := r.Route(info.TableName.Schema, info.TableName.Name)
			if err != nil {
				return errors.Trace(err)
			}
			if dbName != info.TableName.Schema {
				oldInfo := knownDBNames[info.TableName.Schema]
				oldInfo.count--
				knownDBNames[info.TableName.Schema] = oldInfo

				newInfo, ok := knownDBNames[dbName]
				newInfo.count++
				if !ok {
					newInfo.fileMeta = oldInfo.fileMeta
					s.dbSchemas = append(s.dbSchemas, FileInfo{
						TableName: filter.Table{Schema: dbName},
						FileMeta:  oldInfo.fileMeta,
					})
				}
				knownDBNames[dbName] = newInfo
			}
			arr[i].TableName = filter.Table{Schema: dbName, Name: tableName}
		}
		return nil
	}

	if err := run(s.tableSchemas); err != nil {
		return errors.Trace(err)
	}
	if err := run(s.viewSchemas); err != nil {
		return errors.Trace(err)
	}
	if err := run(s.tableDatas); err != nil {
		return errors.Trace(err)
	}

	// remove all schemas that have been entirely routed away
	// https://github.com/golang/go/wiki/SliceTricks#filtering-without-allocating
	remainingSchemas := s.dbSchemas[:0]
	for _, info := range s.dbSchemas {
		if knownDBNames[info.TableName.Schema].count > 0 {
			remainingSchemas = append(remainingSchemas, info)
		}
	}
	s.dbSchemas = remainingSchemas

	return nil
}
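
// Routing sketch (hypothetical rule values, not from this repository): a rule
// that merges sharded schemas into one target schema makes r.Route map every
// shard table onto the same destination, after which the now-empty shard
// schemas are filtered out of s.dbSchemas above.
//
//	rules := []*router.TableRule{{
//		SchemaPattern: "shard_db_*",
//		TablePattern:  "t",
//		TargetSchema:  "db",
//		TargetTable:   "t",
//	}}
//	r, _ := router.NewTableRouter(false, rules)
//	dbName, tblName, _ := r.Route("shard_db_01", "t") // "db", "t"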

func (s *mdLoaderSetup) insertDB(dbName string, path string) (*MDDatabaseMeta, bool) {
	dbIndex, ok := s.dbIndexMap[dbName]
	if ok {
		return s.loader.dbs[dbIndex], true
	}
	s.dbIndexMap[dbName] = len(s.loader.dbs)
	ptr := &MDDatabaseMeta{
		Name:       dbName,
		SchemaFile: path,
		charSet:    s.loader.charSet,
	}
	s.loader.dbs = append(s.loader.dbs, ptr)
	return ptr, false
}

func (s *mdLoaderSetup) insertTable(fileInfo FileInfo) (*MDTableMeta, bool, bool) {
	dbMeta, dbExists := s.insertDB(fileInfo.TableName.Schema, "")
	tableIndex, ok := s.tableIndexMap[fileInfo.TableName]
	if ok {
		return dbMeta.Tables[tableIndex], dbExists, true
	}
	s.tableIndexMap[fileInfo.TableName] = len(dbMeta.Tables)
	ptr := &MDTableMeta{
		DB:         fileInfo.TableName.Schema,
		Name:       fileInfo.TableName.Name,
		SchemaFile: fileInfo,
		DataFiles:  make([]FileInfo, 0, 16),
		charSet:    s.loader.charSet,
	}
	dbMeta.Tables = append(dbMeta.Tables, ptr)
	return ptr, dbExists, false
}

// insertView records a view's schema file. It only succeeds when a table
// schema registered under the same name already exists.
func (s *mdLoaderSetup) insertView(fileInfo FileInfo) (bool, bool) {
	dbMeta, dbExists := s.insertDB(fileInfo.TableName.Schema, "")
	_, ok := s.tableIndexMap[fileInfo.TableName]
	if ok {
		meta := &MDTableMeta{
			DB:         fileInfo.TableName.Schema,
			Name:       fileInfo.TableName.Name,
			SchemaFile: fileInfo,
			charSet:    s.loader.charSet,
		}
		dbMeta.Views = append(dbMeta.Views, meta)
	}
	return dbExists, ok
}

// GetDatabases returns the database metadata discovered by the loader.
func (l *MDLoader) GetDatabases() []*MDDatabaseMeta {
	return l.dbs
}

// GetStore returns the external storage the loader reads from.
func (l *MDLoader) GetStore() storage.ExternalStorage {
	return l.store
}
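
// Consumption sketch (illustrative, assuming a `loader` and `ctx` from the
// earlier example): read each table's CREATE statement and walk its data
// files through the loader's store.
//
//	store := loader.GetStore()
//	for _, db := range loader.GetDatabases() {
//		for _, tbl := range db.Tables {
//			createStmt := tbl.GetSchema(ctx, store) // "" if the file is unreadable
//			fmt.Println(tbl.DB, tbl.Name, len(createStmt))
//			for _, f := range tbl.DataFiles {
//				fmt.Println("  data file:", f.FileMeta.Path, f.FileMeta.FileSize)
//			}
//		}
//	}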