github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/restore/check_info.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package restore
    15  
    16  import (
    17  	"bytes"
    18  	"context"
    19  	"fmt"
    20  	"io"
    21  	"path/filepath"
    22  	"reflect"
    23  	"strings"
    24  
    25  	"github.com/docker/go-units"
    26  	"github.com/pingcap/errors"
    27  	"github.com/pingcap/failpoint"
    28  	"github.com/pingcap/parser/model"
    29  	"github.com/pingcap/parser/mysql"
    30  	"github.com/pingcap/tidb/table/tables"
    31  	"github.com/tikv/pd/pkg/typeutil"
    32  	"github.com/tikv/pd/server/api"
    33  	pdconfig "github.com/tikv/pd/server/config"
    34  	"go.uber.org/zap"
    35  
    36  	"github.com/pingcap/br/pkg/lightning/backend"
    37  	"github.com/pingcap/br/pkg/lightning/backend/kv"
    38  	"github.com/pingcap/br/pkg/lightning/checkpoints"
    39  	"github.com/pingcap/br/pkg/lightning/common"
    40  	"github.com/pingcap/br/pkg/lightning/config"
    41  	"github.com/pingcap/br/pkg/lightning/log"
    42  	"github.com/pingcap/br/pkg/lightning/mydump"
    43  	"github.com/pingcap/br/pkg/lightning/verification"
    44  	"github.com/pingcap/br/pkg/storage"
    45  )
    46  
    47  const (
    48  	pdWriteFlow = "/pd/api/v1/regions/writeflow"
    49  	pdReadFlow  = "/pd/api/v1/regions/readflow"
    50  
    51  	// OnlineBytesLimitation/OnlineKeysLimitation are thresholds for the per-region
    52  	// Bytes/Keys statistics fetched from pdWriteFlow/pdReadFlow.
    53  	// They determine whether some regions in the cluster already carry other load
    54  	// that might influence the import task in the future.
    55  	OnlineBytesLimitation = 10 * units.MiB
    56  	OnlineKeysLimitation  = 5000
    57  
    58  	pdStores    = "/pd/api/v1/stores"
    59  	pdReplicate = "/pd/api/v1/config/replicate"
    60  
    61  	defaultCSVSize    = 10 * units.GiB
    62  	maxSampleDataSize = 10 * 1024 * 1024
    63  	maxSampleRowCount = 10 * 1024
    64  )
    65  
    66  func (rc *Controller) isSourceInLocal() bool {
    67  	return strings.HasPrefix(rc.store.URI(), storage.LocalURIPrefix)
    68  }
    69  
    70  func (rc *Controller) getReplicaCount(ctx context.Context) (uint64, error) {
    71  	result := &pdconfig.ReplicationConfig{}
    72  	err := rc.tls.WithHost(rc.cfg.TiDB.PdAddr).GetJSON(ctx, pdReplicate, &result)
    73  	if err != nil {
    74  		return 0, errors.Trace(err)
    75  	}
    76  	return result.MaxReplicas, nil
    77  }
    78  
    79  // ClusterResource checks whether the cluster has enough resources to import the data. This test can be skipped.
    80  func (rc *Controller) ClusterResource(ctx context.Context, localSource int64) error {
    81  	passed := true
    82  	message := "Cluster resources are sufficient for this import task"
    83  	defer func() {
    84  		rc.checkTemplate.Collect(Critical, passed, message)
    85  	}()
    86  
    87  	result := &api.StoresInfo{}
    88  	err := rc.tls.WithHost(rc.cfg.TiDB.PdAddr).GetJSON(ctx, pdStores, result)
    89  	if err != nil {
    90  		return errors.Trace(err)
    91  	}
    92  	totalCapacity := typeutil.ByteSize(0)
    93  	for _, store := range result.Stores {
    94  		totalCapacity += store.Status.Capacity
    95  	}
    96  	clusterSource := localSource
    97  	if rc.taskMgr != nil {
    98  		clusterSource, err = rc.taskMgr.CheckClusterSource(ctx)
    99  		if err != nil {
   100  			return errors.Trace(err)
   101  		}
   102  	}
   103  
   104  	replicaCount, err := rc.getReplicaCount(ctx)
   105  	if err != nil {
   106  		return errors.Trace(err)
   107  	}
   108  	estimateSize := uint64(clusterSource) * replicaCount
   109  	if typeutil.ByteSize(estimateSize) > totalCapacity {
   110  		passed = false
   111  		message = fmt.Sprintf("Cluster doesn't have enough space, capacity is %s, but we need %s",
   112  			units.BytesSize(float64(totalCapacity)), units.BytesSize(float64(estimateSize)))
   113  	} else {
   114  		message = fmt.Sprintf("Cluster capacity is sufficient, capacity is %s, we need %s",
   115  			units.BytesSize(float64(totalCapacity)), units.BytesSize(float64(estimateSize)))
   116  	}
   117  	return nil
   118  }
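        // A rough worked example of the check above (illustrative numbers, not from any
        // real deployment): with an estimated data size of 500 GiB and max-replicas = 3,
        // estimateSize is 500 GiB * 3 = 1500 GiB; if the summed Status.Capacity of all
        // stores reported by PD is smaller than that, the check is collected as a
        // Critical failure.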
   119  
   120  // ClusterIsAvailable checks whether the cluster is available to import data. This test can be skipped.
   121  func (rc *Controller) ClusterIsAvailable(ctx context.Context) error {
   122  	passed := true
   123  	message := "Cluster is available"
   124  	defer func() {
   125  		rc.checkTemplate.Collect(Critical, passed, message)
   126  	}()
   127  	// skip requirement check if explicitly turned off
   128  	if !rc.cfg.App.CheckRequirements {
   129  		message = "Cluster availability check is skipped per user configuration"
   130  		return nil
   131  	}
   132  	checkCtx := &backend.CheckCtx{
   133  		DBMetas: rc.dbMetas,
   134  	}
   135  	if err := rc.backend.CheckRequirements(ctx, checkCtx); err != nil {
   136  		passed = false
   137  		message = fmt.Sprintf("cluster available check failed: %s", err.Error())
   138  	}
   139  	return nil
   140  }
   141  
   142  // StoragePermission checks whether Lightning has enough permission to access the storage.
   143  // This test cannot be skipped.
   144  func (rc *Controller) StoragePermission(ctx context.Context) error {
   145  	passed := true
   146  	message := "Lightning has the correct storage permission"
   147  	defer func() {
   148  		rc.checkTemplate.Collect(Critical, passed, message)
   149  	}()
   150  
   151  	u, err := storage.ParseBackend(rc.cfg.Mydumper.SourceDir, nil)
   152  	if err != nil {
   153  		return errors.Annotate(err, "parse backend failed")
   154  	}
   155  	_, err = storage.New(ctx, u, &storage.ExternalStorageOptions{
   156  		CheckPermissions: []storage.Permission{
   157  			storage.ListObjects,
   158  			storage.GetObject,
   159  		},
   160  	})
   161  	if err != nil {
   162  		passed = false
   163  		message = err.Error()
   164  	}
   165  	return nil
   166  }
   167  
   168  // HasLargeCSV checks whether the input CSV files are suitable for Lightning import.
   169  // If strictFormat is false and a CSV file is large, Lightning will have performance issues.
   170  // This test cannot be skipped.
   171  func (rc *Controller) HasLargeCSV(dbMetas []*mydump.MDDatabaseMeta) error {
   172  	passed := true
   173  	message := "Source CSV file sizes are appropriate"
   174  	defer func() {
   175  		rc.checkTemplate.Collect(Warn, passed, message)
   176  	}()
   177  	if !rc.cfg.Mydumper.StrictFormat {
   178  		for _, db := range dbMetas {
   179  			for _, t := range db.Tables {
   180  				for _, f := range t.DataFiles {
   181  					if f.FileMeta.FileSize > defaultCSVSize {
   182  					message = fmt.Sprintf("large CSV file %s exists and will slow down import performance", f.FileMeta.Path)
   183  						passed = false
   184  					}
   185  				}
   186  			}
   187  		}
   188  	} else {
   189  		message = "Skip the csv size check, because config.StrictFormat is true"
   190  	}
   191  	return nil
   192  }
   193  
   194  func (rc *Controller) EstimateSourceData(ctx context.Context) (int64, error) {
   195  	sourceSize := int64(0)
   196  	originSource := int64(0)
   197  	bigTableCount := 0
   198  	tableCount := 0
   199  	unSortedTableCount := 0
   200  	for _, db := range rc.dbMetas {
   201  		info, ok := rc.dbInfos[db.Name]
   202  		if !ok {
   203  			continue
   204  		}
   205  		for _, tbl := range db.Tables {
   206  			tableInfo, ok := info.Tables[tbl.Name]
   207  			if ok {
   208  				if err := rc.SampleDataFromTable(ctx, db.Name, tbl, tableInfo.Core); err != nil {
   209  					return sourceSize, errors.Trace(err)
   210  				}
   211  				sourceSize += int64(float64(tbl.TotalSize) * tbl.IndexRatio)
   212  				originSource += tbl.TotalSize
   213  				if tbl.TotalSize > int64(config.DefaultBatchSize)*2 {
   214  					bigTableCount += 1
   215  					if !tbl.IsRowOrdered {
   216  						unSortedTableCount += 1
   217  					}
   218  				}
   219  				tableCount += 1
   220  			}
   221  		}
   222  	}
   223  
   224  	// Do not import with too high a concurrency because the data may be entirely unsorted.
   225  	if bigTableCount > 0 && unSortedTableCount > 0 {
   226  		if rc.cfg.App.TableConcurrency > rc.cfg.App.IndexConcurrency {
   227  			rc.cfg.App.TableConcurrency = rc.cfg.App.IndexConcurrency
   228  		}
   229  	}
   230  	return sourceSize, nil
   231  }
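        // A hedged reading of the concurrency adjustment above: when at least one table
        // larger than twice config.DefaultBatchSize (on the order of 100 GiB by default)
        // is not row-ordered, many of its engines may need local sorting at the same
        // time, so table-concurrency is capped at index-concurrency to avoid importing
        // too many unsorted tables in parallel.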
   232  
   233  // LocalResource checks whether the local node has enough resources for this import when the local backend is enabled.
   234  func (rc *Controller) LocalResource(ctx context.Context, sourceSize int64) error {
   235  	if rc.isSourceInLocal() {
   236  		sourceDir := strings.TrimPrefix(rc.cfg.Mydumper.SourceDir, storage.LocalURIPrefix)
   237  		same, err := common.SameDisk(sourceDir, rc.cfg.TikvImporter.SortedKVDir)
   238  		if err != nil {
   239  			return errors.Trace(err)
   240  		}
   241  		if same {
   242  			rc.checkTemplate.Collect(Warn, false,
   243  				fmt.Sprintf("sorted-kv-dir:%s and data-source-dir:%s are on the same disk, which may slow down performance",
   244  					rc.cfg.TikvImporter.SortedKVDir, sourceDir))
   245  		}
   246  	}
   247  
   248  	storageSize, err := common.GetStorageSize(rc.cfg.TikvImporter.SortedKVDir)
   249  	if err != nil {
   250  		return errors.Trace(err)
   251  	}
   252  	localAvailable := storageSize.Available
   253  	if err = rc.taskMgr.InitTask(ctx, sourceSize); err != nil {
   254  		return errors.Trace(err)
   255  	}
   256  
   257  	var message string
   258  	var passed bool
   259  	switch {
   260  	case localAvailable > uint64(sourceSize):
   261  		message = fmt.Sprintf("local disk resources are sufficient, estimated sorted data size is %s, local available space is %s",
   262  			units.BytesSize(float64(sourceSize)), units.BytesSize(float64(localAvailable)))
   263  		passed = true
   264  	default:
   265  		if int64(rc.cfg.TikvImporter.DiskQuota) > int64(localAvailable) {
   266  			message = fmt.Sprintf("local disk space may not be enough to finish the import, "+
   267  				"estimated sorted data size is %s, but local available space is %s, "+
   268  				"you need a smaller value for tikv-importer.disk-quota (%s) to finish the import",
   269  				units.BytesSize(float64(sourceSize)),
   270  				units.BytesSize(float64(localAvailable)), units.BytesSize(float64(rc.cfg.TikvImporter.DiskQuota)))
   271  			passed = false
   272  			log.L().Error(message)
   273  		} else {
   274  			message = fmt.Sprintf("local disk space may not be enough to finish the import, "+
   275  				"estimated sorted data size is %s, but local available space is %s, "+
   276  				"we will use disk-quota (size: %s) to finish the import, which may slow it down",
   277  				units.BytesSize(float64(sourceSize)),
   278  				units.BytesSize(float64(localAvailable)), units.BytesSize(float64(rc.cfg.TikvImporter.DiskQuota)))
   279  			passed = true
   280  			log.L().Warn(message)
   281  		}
   282  	}
   283  	rc.checkTemplate.Collect(Critical, passed, message)
   284  	return nil
   285  }
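        // Hypothetical numbers for the branches above: with an estimated sorted size of
        // 800 GiB and 500 GiB available under sorted-kv-dir, the import can still pass
        // when tikv-importer.disk-quota is, say, 400 GiB, because sorted data is flushed
        // whenever the quota is reached; if disk-quota were 600 GiB (more than the
        // 500 GiB actually available), the check fails as Critical.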
   286  
   287  // CheckpointIsValid checks whether we can start this import with this checkpoint.
   288  func (rc *Controller) CheckpointIsValid(ctx context.Context, tableInfo *mydump.MDTableMeta) ([]string, bool, error) {
   289  	msgs := make([]string, 0)
   290  	uniqueName := common.UniqueTable(tableInfo.DB, tableInfo.Name)
   291  	tableCheckPoint, err := rc.checkpointsDB.Get(ctx, uniqueName)
   292  	if err != nil {
   293  		// there is no checkpoint
   294  		log.L().Debug("no checkpoint detected", zap.String("table", uniqueName))
   295  		return nil, true, nil
   296  	}
   297  	// if the checkpoint is enabled and not missing, we skip the table-empty check.
   298  	if tableCheckPoint.Status <= checkpoints.CheckpointStatusMissing {
   299  		return nil, false, nil
   300  	}
   301  
   302  	var permFromCheckpoint []int
   303  	var columns []string
   304  	for _, eng := range tableCheckPoint.Engines {
   305  		if len(eng.Chunks) > 0 {
   306  			chunk := eng.Chunks[0]
   307  			permFromCheckpoint = chunk.ColumnPermutation
   308  			columns = chunk.Chunk.Columns
   309  			if filepath.Dir(chunk.FileMeta.Path) != rc.cfg.Mydumper.SourceDir {
   310  				message := fmt.Sprintf("chunk checkpoint path is not equal to the configured source dir, "+
   311  					"checkpoint is %s, config source dir is %s", chunk.FileMeta.Path, rc.cfg.Mydumper.SourceDir)
   312  				msgs = append(msgs, message)
   313  			}
   314  		}
   315  	}
   316  	if len(columns) == 0 {
   317  		log.L().Debug("no valid checkpoint detected", zap.String("table", uniqueName))
   318  		return nil, false, nil
   319  	}
   320  	info := rc.dbInfos[tableInfo.DB].Tables[tableInfo.Name]
   321  	if info != nil {
   322  		permFromTiDB, err := parseColumnPermutations(info.Core, columns, nil)
   323  		if err != nil {
   324  			msgs = append(msgs, fmt.Sprintf("failed to calculate columns %s, table %s's info has changed, "+
   325  				"consider removing this checkpoint and starting the import again.", err.Error(), uniqueName))
   326  		}
   327  		if !reflect.DeepEqual(permFromCheckpoint, permFromTiDB) {
   328  			msgs = append(msgs, fmt.Sprintf("column permutation comparison failed, table %s's info has changed, "+
   329  				"consider removing this checkpoint and starting the import again.", uniqueName))
   330  		}
   331  	}
   332  	return msgs, false, nil
   333  }
   334  
   335  // hasDefault reports whether col has a default value.
   336  func hasDefault(col *model.ColumnInfo) bool {
   337  	return col.DefaultIsExpr || col.DefaultValue != nil || !mysql.HasNotNullFlag(col.Flag) ||
   338  		col.IsGenerated() || mysql.HasAutoIncrementFlag(col.Flag)
   339  }
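        // For illustration, using a hypothetical table that is not part of this file:
        // given CREATE TABLE t (a INT PRIMARY KEY AUTO_INCREMENT, b INT NOT NULL, c INT, d INT DEFAULT 1),
        // hasDefault is true for a (auto-increment), c (nullable) and d (explicit default)
        // and false for b, so only b has to be present in every data file.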
   340  
   341  func (rc *Controller) readColumnsAndCount(ctx context.Context, dataFileMeta mydump.SourceFileMeta) (cols []string, colCnt int, err error) {
   342  	var reader storage.ReadSeekCloser
   343  	if dataFileMeta.Type == mydump.SourceTypeParquet {
   344  		reader, err = mydump.OpenParquetReader(ctx, rc.store, dataFileMeta.Path, dataFileMeta.FileSize)
   345  	} else {
   346  		reader, err = rc.store.Open(ctx, dataFileMeta.Path)
   347  	}
   348  	if err != nil {
   349  		return nil, 0, errors.Trace(err)
   350  	}
   351  
   352  	var parser mydump.Parser
   353  	blockBufSize := int64(rc.cfg.Mydumper.ReadBlockSize)
   354  	switch dataFileMeta.Type {
   355  	case mydump.SourceTypeCSV:
   356  		hasHeader := rc.cfg.Mydumper.CSV.Header
   357  		parser = mydump.NewCSVParser(&rc.cfg.Mydumper.CSV, reader, blockBufSize, rc.ioWorkers, hasHeader)
   358  	case mydump.SourceTypeSQL:
   359  		parser = mydump.NewChunkParser(rc.cfg.TiDB.SQLMode, reader, blockBufSize, rc.ioWorkers)
   360  	case mydump.SourceTypeParquet:
   361  		parser, err = mydump.NewParquetParser(ctx, rc.store, reader, dataFileMeta.Path)
   362  		if err != nil {
   363  			return nil, 0, errors.Trace(err)
   364  		}
   365  	default:
   366  		panic(fmt.Sprintf("unknown file type '%s'", dataFileMeta.Type))
   367  	}
   368  	defer parser.Close()
   369  
   370  	err = parser.ReadRow()
   371  	if err != nil && errors.Cause(err) != io.EOF {
   372  		return nil, 0, errors.Trace(err)
   373  	}
   374  	return parser.Columns(), len(parser.LastRow().Row), nil
   375  }
   376  
   377  // SchemaIsValid checks whether the import files match the cluster schema.
   378  func (rc *Controller) SchemaIsValid(ctx context.Context, tableInfo *mydump.MDTableMeta) ([]string, error) {
   379  	msgs := make([]string, 0)
   380  	info, ok := rc.dbInfos[tableInfo.DB].Tables[tableInfo.Name]
   381  	if !ok {
   382  		msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s` doesn't exist, "+
   383  			"please provide a schema file in the source dir or create the table manually", tableInfo.DB, tableInfo.Name))
   384  		return msgs, nil
   385  	}
   386  
   387  	igCols := make(map[string]struct{})
   388  	igCol, err := rc.cfg.Mydumper.IgnoreColumns.GetIgnoreColumns(tableInfo.DB, tableInfo.Name, rc.cfg.Mydumper.CaseSensitive)
   389  	if err != nil {
   390  		return nil, errors.Trace(err)
   391  	}
   392  	for _, col := range igCol.Columns {
   393  		igCols[col] = struct{}{}
   394  	}
   395  
   396  	if len(tableInfo.DataFiles) == 0 {
   397  		log.L().Info("no data files detected", zap.String("db", tableInfo.DB), zap.String("table", tableInfo.Name))
   398  		return nil, nil
   399  	}
   400  
   401  	colCountFromTiDB := len(info.Core.Columns)
   402  	core := info.Core
   403  	defaultCols := make(map[string]struct{})
   404  	for _, col := range core.Columns {
   405  		if hasDefault(col) || (info.Core.ContainsAutoRandomBits() && mysql.HasPriKeyFlag(col.Flag)) {
   406  			// this column has a default value or is the auto-random primary key, so we can ignore it
   407  			defaultCols[col.Name.L] = struct{}{}
   408  		}
   409  	}
   410  	// tidb_rowid has a default value.
   411  	defaultCols[model.ExtraHandleName.String()] = struct{}{}
   412  
   413  	for _, dataFile := range tableInfo.DataFiles {
   414  		// get column names from the data file.
   415  		dataFileMeta := dataFile.FileMeta
   416  
   417  		if tp := dataFileMeta.Type; tp != mydump.SourceTypeCSV && tp != mydump.SourceTypeSQL && tp != mydump.SourceTypeParquet {
   418  			msgs = append(msgs, fmt.Sprintf("file '%s' with unknown source type '%s'", dataFileMeta.Path, dataFileMeta.Type.String()))
   419  			return msgs, nil
   420  		}
   421  		colsFromDataFile, colCountFromDataFile, err := rc.readColumnsAndCount(ctx, dataFileMeta)
   422  		if err != nil {
   423  			return nil, errors.Trace(err)
   424  		}
   425  		if colsFromDataFile == nil && colCountFromDataFile == 0 {
   426  			log.L().Info("file contains no data, skip checking against schema validity", zap.String("path", dataFileMeta.Path))
   427  			continue
   428  		}
   429  
   430  		if colsFromDataFile == nil {
   431  			// when there are no column names in the data file, we must insert data in order,
   432  			// so each of the trailing columns must either be ignorable or have a default value.
   433  			for i := colCountFromDataFile; i < colCountFromTiDB; i++ {
   434  				if _, ok := defaultCols[core.Columns[i].Name.L]; !ok {
   435  					msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s` has %d columns, "+
   436  						"and the data file has %d columns, but column %s is missing a default value, "+
   437  						"please give the column a default value to skip this check",
   438  						tableInfo.DB, tableInfo.Name, colCountFromTiDB, colCountFromDataFile, core.Columns[i].Name.L))
   439  				}
   440  			}
   441  		} else {
   442  			// compare column names and make sure
   443  			// 1. the TiDB table info contains all of the data file's columns (besides ignored columns)
   444  			// 2. the columns not present in the data file all have a default value.
   445  			colMap := make(map[string]struct{})
   446  			for col := range igCols {
   447  				colMap[col] = struct{}{}
   448  			}
   449  			for _, col := range core.Columns {
   450  				if _, ok := colMap[col.Name.L]; ok {
   451  					// this TiDB column is ignored,
   452  					// so we need to ensure it has a default value.
   453  					if _, hasDefault := defaultCols[col.Name.L]; !hasDefault {
   454  						msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s`'s column %s cannot be ignored, "+
   455  							"because it doesn't have a default value, please set tables.ignoreColumns properly",
   456  							tableInfo.DB, tableInfo.Name, col.Name.L))
   457  					}
   458  				} else {
   459  					colMap[col.Name.L] = struct{}{}
   460  				}
   461  			}
   462  			// tidb_rowid can be ignored in this check
   463  			colMap[model.ExtraHandleName.String()] = struct{}{}
   464  			for _, col := range colsFromDataFile {
   465  				if _, ok := colMap[col]; !ok {
   466  					checkMsg := "please check table schema"
   467  					if dataFileMeta.Type == mydump.SourceTypeCSV && rc.cfg.Mydumper.CSV.Header {
   468  						checkMsg += " and csv file header"
   469  					}
   470  					msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s` doesn't have column %s, "+
   471  						"%s or use tables.ignoreColumns to ignore %s",
   472  						tableInfo.DB, tableInfo.Name, col, checkMsg, col))
   473  				} else {
   474  					// remove the matched column so only unmatched ones remain after the loop
   475  					delete(colMap, col)
   476  				}
   477  			}
   478  			// report the remaining columns that don't have a default value.
   479  			for col := range colMap {
   480  				if _, ok := defaultCols[col]; ok {
   481  					continue
   482  				}
   483  				msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s` doesn't have a default value for %s, "+
   484  					"please give a default value for %s, choose another column to ignore, or add this column to the data file",
   485  					tableInfo.DB, tableInfo.Name, col, col))
   486  			}
   487  		}
   488  		if len(msgs) > 0 {
   489  			return msgs, nil
   490  		}
   491  	}
   492  	return msgs, nil
   493  }
   494  
   495  func (rc *Controller) SampleDataFromTable(ctx context.Context, dbName string, tableMeta *mydump.MDTableMeta, tableInfo *model.TableInfo) error {
   496  	if len(tableMeta.DataFiles) == 0 {
   497  		return nil
   498  	}
   499  	sampleFile := tableMeta.DataFiles[0].FileMeta
   500  	var reader storage.ReadSeekCloser
   501  	var err error
   502  	if sampleFile.Type == mydump.SourceTypeParquet {
   503  		reader, err = mydump.OpenParquetReader(ctx, rc.store, sampleFile.Path, sampleFile.FileSize)
   504  	} else {
   505  		reader, err = rc.store.Open(ctx, sampleFile.Path)
   506  	}
   507  	if err != nil {
   508  		return errors.Trace(err)
   509  	}
   510  	idAlloc := kv.NewPanickingAllocators(0)
   511  	tbl, err := tables.TableFromMeta(idAlloc, tableInfo)
        	if err != nil {
        		return errors.Trace(err)
        	}
   512  
   513  	kvEncoder, err := rc.backend.NewEncoder(tbl, &kv.SessionOptions{
   514  		SQLMode:        rc.cfg.TiDB.SQLMode,
   515  		Timestamp:      0,
   516  		SysVars:        rc.sysVars,
   517  		AutoRandomSeed: 0,
   518  	})
        	if err != nil {
        		return errors.Trace(err)
        	}
   519  	blockBufSize := int64(rc.cfg.Mydumper.ReadBlockSize)
   520  
   521  	var parser mydump.Parser
   522  	switch tableMeta.DataFiles[0].FileMeta.Type {
   523  	case mydump.SourceTypeCSV:
   524  		hasHeader := rc.cfg.Mydumper.CSV.Header
   525  		parser = mydump.NewCSVParser(&rc.cfg.Mydumper.CSV, reader, blockBufSize, rc.ioWorkers, hasHeader)
   526  	case mydump.SourceTypeSQL:
   527  		parser = mydump.NewChunkParser(rc.cfg.TiDB.SQLMode, reader, blockBufSize, rc.ioWorkers)
   528  	case mydump.SourceTypeParquet:
   529  		parser, err = mydump.NewParquetParser(ctx, rc.store, reader, sampleFile.Path)
   530  		if err != nil {
   531  			return errors.Trace(err)
   532  		}
   533  	default:
   534  		panic(fmt.Sprintf("file '%s' with unknown source type '%s'", sampleFile.Path, sampleFile.Type.String()))
   535  	}
   536  	defer parser.Close()
   537  	logTask := log.With(zap.String("table", tableMeta.Name)).Begin(zap.InfoLevel, "sample file")
   538  	igCols, err := rc.cfg.Mydumper.IgnoreColumns.GetIgnoreColumns(dbName, tableMeta.Name, rc.cfg.Mydumper.CaseSensitive)
   539  	if err != nil {
   540  		return errors.Trace(err)
   541  	}
   542  
   543  	initializedColumns, reachEOF := false, false
   544  	var columnPermutation []int
   545  	var kvSize uint64 = 0
   546  	var rowSize uint64 = 0
   547  	rowCount := 0
   548  	dataKVs := rc.backend.MakeEmptyRows()
   549  	indexKVs := rc.backend.MakeEmptyRows()
   550  	lastKey := make([]byte, 0)
   551  	tableMeta.IsRowOrdered = true
   552  	tableMeta.IndexRatio = 1.0
   553  outloop:
   554  	for !reachEOF {
   555  		offset, _ := parser.Pos()
   556  		err = parser.ReadRow()
   557  		columnNames := parser.Columns()
   558  
   559  		switch errors.Cause(err) {
   560  		case nil:
   561  			if !initializedColumns {
   562  				if len(columnPermutation) == 0 {
   563  					columnPermutation, err = createColumnPermutation(columnNames, igCols.Columns, tableInfo)
   564  					if err != nil {
   565  						return errors.Trace(err)
   566  					}
   567  				}
   568  				initializedColumns = true
   569  			}
   570  		case io.EOF:
   571  			reachEOF = true
   572  			break outloop
   573  		default:
   574  			err = errors.Annotatef(err, "in file offset %d", offset)
   575  			return errors.Trace(err)
   576  		}
   577  		lastRow := parser.LastRow()
   578  		rowSize += uint64(lastRow.Length)
   579  		rowCount += 1
   580  
   581  		var dataChecksum, indexChecksum verification.KVChecksum
   582  		kvs, encodeErr := kvEncoder.Encode(logTask.Logger, lastRow.Row, lastRow.RowID, columnPermutation, offset)
   583  		parser.RecycleRow(lastRow)
   584  		if encodeErr != nil {
   585  			err = errors.Annotatef(encodeErr, "in file at offset %d", offset)
   586  			return errors.Trace(err)
   587  		}
   588  		if tableMeta.IsRowOrdered {
   589  			kvs.ClassifyAndAppend(&dataKVs, &dataChecksum, &indexKVs, &indexChecksum)
   590  			for _, kv := range kv.KvPairsFromRows(dataKVs) {
   591  				if len(lastKey) == 0 {
   592  					lastKey = kv.Key
   593  				} else if bytes.Compare(lastKey, kv.Key) > 0 {
   594  					tableMeta.IsRowOrdered = false
   595  					break
   596  				}
   597  			}
   598  			dataKVs = dataKVs.Clear()
   599  			indexKVs = indexKVs.Clear()
   600  		}
   601  		kvSize += kvs.Size()
   602  
   603  		failpoint.Inject("mock-kv-size", func(val failpoint.Value) {
   604  			kvSize += uint64(val.(int))
   605  		})
   606  		if rowSize > maxSampleDataSize && rowCount > maxSampleRowCount {
   607  			break
   608  		}
   609  	}
   610  
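        	// Illustrative arithmetic (made-up numbers): if the sampled rows total 40 MiB
        	// while their encoded data plus index KV pairs total 60 MiB, IndexRatio becomes
        	// 60/40 = 1.5, which EstimateSourceData multiplies into each table's total size
        	// to approximate the local sorted-KV footprint.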
   611  	if rowSize > 0 && kvSize > rowSize {
   612  		tableMeta.IndexRatio = float64(kvSize) / float64(rowSize)
   613  	}
   614  	log.L().Info("Sample source data", zap.String("table", tableMeta.Name), zap.Float64("IndexRatio", tableMeta.IndexRatio), zap.Bool("IsSourceOrder", tableMeta.IsRowOrdered))
   615  	return nil
   616  }