github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/restore/check_info.go

// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package restore

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"path/filepath"
	"reflect"
	"strings"

	"github.com/docker/go-units"
	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/parser/model"
	"github.com/pingcap/parser/mysql"
	"github.com/pingcap/tidb/table/tables"
	"github.com/tikv/pd/pkg/typeutil"
	"github.com/tikv/pd/server/api"
	pdconfig "github.com/tikv/pd/server/config"
	"go.uber.org/zap"

	"github.com/pingcap/br/pkg/lightning/backend"
	"github.com/pingcap/br/pkg/lightning/backend/kv"
	"github.com/pingcap/br/pkg/lightning/checkpoints"
	"github.com/pingcap/br/pkg/lightning/common"
	"github.com/pingcap/br/pkg/lightning/config"
	"github.com/pingcap/br/pkg/lightning/log"
	"github.com/pingcap/br/pkg/lightning/mydump"
	"github.com/pingcap/br/pkg/lightning/verification"
	"github.com/pingcap/br/pkg/storage"
)

const (
	pdWriteFlow = "/pd/api/v1/regions/writeflow"
	pdReadFlow  = "/pd/api/v1/regions/readflow"

	// OnlineBytesLimitation/OnlineKeysLimitation are the limits on Bytes/Keys
	// used per region, taken from the pdWriteFlow/pdReadFlow statistics.
	// They determine whether the cluster has regions carrying other loads
	// that might influence the import task in the future.
	OnlineBytesLimitation = 10 * units.MiB
	OnlineKeysLimitation  = 5000

	pdStores    = "/pd/api/v1/stores"
	pdReplicate = "/pd/api/v1/config/replicate"

	defaultCSVSize    = 10 * units.GiB
	maxSampleDataSize = 10 * 1024 * 1024
	maxSampleRowCount = 10 * 1024
)

func (rc *Controller) isSourceInLocal() bool {
	return strings.HasPrefix(rc.store.URI(), storage.LocalURIPrefix)
}

func (rc *Controller) getReplicaCount(ctx context.Context) (uint64, error) {
	result := &pdconfig.ReplicationConfig{}
	err := rc.tls.WithHost(rc.cfg.TiDB.PdAddr).GetJSON(ctx, pdReplicate, &result)
	if err != nil {
		return 0, errors.Trace(err)
	}
	return result.MaxReplicas, nil
}

// ClusterResource checks whether the cluster has enough resources to import data.
// This test can be skipped.
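// The estimated ingested size is the source data size (cluster-wide when a task
// manager is present) multiplied by the PD max-replicas setting; it is compared
// against the total capacity reported by all TiKV stores.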
func (rc *Controller) ClusterResource(ctx context.Context, localSource int64) error {
	passed := true
	message := "Cluster resources are rich for this import task"
	defer func() {
		rc.checkTemplate.Collect(Critical, passed, message)
	}()

	result := &api.StoresInfo{}
	err := rc.tls.WithHost(rc.cfg.TiDB.PdAddr).GetJSON(ctx, pdStores, result)
	if err != nil {
		return errors.Trace(err)
	}
	totalCapacity := typeutil.ByteSize(0)
	for _, store := range result.Stores {
		totalCapacity += store.Status.Capacity
	}
	clusterSource := localSource
	if rc.taskMgr != nil {
		clusterSource, err = rc.taskMgr.CheckClusterSource(ctx)
		if err != nil {
			return errors.Trace(err)
		}
	}

	replicaCount, err := rc.getReplicaCount(ctx)
	if err != nil {
		return errors.Trace(err)
	}
	estimateSize := uint64(clusterSource) * replicaCount
	if typeutil.ByteSize(estimateSize) > totalCapacity {
		passed = false
		message = fmt.Sprintf("Cluster doesn't have enough space, capacity is %s, but we need %s",
			units.BytesSize(float64(totalCapacity)), units.BytesSize(float64(estimateSize)))
	} else {
		message = fmt.Sprintf("Cluster capacity is rich, capacity is %s, we need %s",
			units.BytesSize(float64(totalCapacity)), units.BytesSize(float64(estimateSize)))
	}
	return nil
}

// ClusterIsAvailable checks whether the cluster is available to import data.
// This test can be skipped.
func (rc *Controller) ClusterIsAvailable(ctx context.Context) error {
	passed := true
	message := "Cluster is available"
	defer func() {
		rc.checkTemplate.Collect(Critical, passed, message)
	}()
	// skip requirement check if explicitly turned off
	if !rc.cfg.App.CheckRequirements {
		message = "Cluster's available check is skipped by user requirement"
		return nil
	}
	checkCtx := &backend.CheckCtx{
		DBMetas: rc.dbMetas,
	}
	if err := rc.backend.CheckRequirements(ctx, checkCtx); err != nil {
		passed = false
		message = fmt.Sprintf("cluster available check failed: %s", err.Error())
	}
	return nil
}

// StoragePermission checks whether Lightning has enough permission to access the storage.
// This test cannot be skipped.
func (rc *Controller) StoragePermission(ctx context.Context) error {
	passed := true
	message := "Lightning has the correct storage permission"
	defer func() {
		rc.checkTemplate.Collect(Critical, passed, message)
	}()

	u, err := storage.ParseBackend(rc.cfg.Mydumper.SourceDir, nil)
	if err != nil {
		return errors.Annotate(err, "parse backend failed")
	}
	_, err = storage.New(ctx, u, &storage.ExternalStorageOptions{
		CheckPermissions: []storage.Permission{
			storage.ListObjects,
			storage.GetObject,
		},
	})
	if err != nil {
		passed = false
		message = err.Error()
	}
	return nil
}

// HasLargeCSV checks whether the input CSV files are fit for Lightning import.
// If strictFormat is false and a CSV file is large, Lightning will have performance issues.
// This test cannot be skipped.
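// A file counts as large when its size exceeds defaultCSVSize (10 GiB).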
func (rc *Controller) HasLargeCSV(dbMetas []*mydump.MDDatabaseMeta) error {
	passed := true
	message := "Source csv files size is proper"
	defer func() {
		rc.checkTemplate.Collect(Warn, passed, message)
	}()
	if !rc.cfg.Mydumper.StrictFormat {
		for _, db := range dbMetas {
			for _, t := range db.Tables {
				for _, f := range t.DataFiles {
					if f.FileMeta.FileSize > defaultCSVSize {
						message = fmt.Sprintf("large csv: %s file exists and it will slow down import performance", f.FileMeta.Path)
						passed = false
					}
				}
			}
		}
	} else {
		message = "Skip the csv size check, because config.StrictFormat is true"
	}
	return nil
}

// EstimateSourceData samples each table and estimates the total size the source
// data will occupy after KV encoding.
func (rc *Controller) EstimateSourceData(ctx context.Context) (int64, error) {
	sourceSize := int64(0)
	originSource := int64(0)
	bigTableCount := 0
	tableCount := 0
	unSortedTableCount := 0
	for _, db := range rc.dbMetas {
		info, ok := rc.dbInfos[db.Name]
		if !ok {
			continue
		}
		for _, tbl := range db.Tables {
			tableInfo, ok := info.Tables[tbl.Name]
			if ok {
				if err := rc.SampleDataFromTable(ctx, db.Name, tbl, tableInfo.Core); err != nil {
					return sourceSize, errors.Trace(err)
				}
				sourceSize += int64(float64(tbl.TotalSize) * tbl.IndexRatio)
				originSource += tbl.TotalSize
				if tbl.TotalSize > int64(config.DefaultBatchSize)*2 {
					bigTableCount += 1
					if !tbl.IsRowOrdered {
						unSortedTableCount += 1
					}
				}
				tableCount += 1
			}
		}
	}

	// Do not import with too large a concurrency because the data may be all unsorted.
	if bigTableCount > 0 && unSortedTableCount > 0 {
		if rc.cfg.App.TableConcurrency > rc.cfg.App.IndexConcurrency {
			rc.cfg.App.TableConcurrency = rc.cfg.App.IndexConcurrency
		}
	}
	return sourceSize, nil
}

// LocalResource checks whether the local node has enough resources for this import
// when the local backend is enabled.
func (rc *Controller) LocalResource(ctx context.Context, sourceSize int64) error {
	if rc.isSourceInLocal() {
		sourceDir := strings.TrimPrefix(rc.cfg.Mydumper.SourceDir, storage.LocalURIPrefix)
		same, err := common.SameDisk(sourceDir, rc.cfg.TikvImporter.SortedKVDir)
		if err != nil {
			return errors.Trace(err)
		}
		if same {
			rc.checkTemplate.Collect(Warn, false,
				fmt.Sprintf("sorted-kv-dir:%s and data-source-dir:%s are in the same disk, may slow down performance",
					rc.cfg.TikvImporter.SortedKVDir, sourceDir))
		}
	}

	storageSize, err := common.GetStorageSize(rc.cfg.TikvImporter.SortedKVDir)
	if err != nil {
		return errors.Trace(err)
	}
	localAvailable := storageSize.Available
	if err = rc.taskMgr.InitTask(ctx, sourceSize); err != nil {
		return errors.Trace(err)
	}

	var message string
	var passed bool
	switch {
	case localAvailable > uint64(sourceSize):
		message = fmt.Sprintf("local disk resources are rich, estimate sorted data size %s, local available is %s",
			units.BytesSize(float64(sourceSize)), units.BytesSize(float64(localAvailable)))
		passed = true
	default:
		if int64(rc.cfg.TikvImporter.DiskQuota) > int64(localAvailable) {
			message = fmt.Sprintf("local disk space may not be enough to finish import, "+
				"estimate sorted data size is %s, but local available is %s, "+
				"you need a smaller number for tikv-importer.disk-quota (%s) to finish imports",
				units.BytesSize(float64(sourceSize)),
				units.BytesSize(float64(localAvailable)),
				units.BytesSize(float64(rc.cfg.TikvImporter.DiskQuota)))
			passed = false
			log.L().Error(message)
		} else {
			message = fmt.Sprintf("local disk space may not be enough to finish import, "+
				"estimate sorted data size is %s, but local available is %s, "+
				"we will use disk-quota (size: %s) to finish imports, which may slow down import",
				units.BytesSize(float64(sourceSize)),
				units.BytesSize(float64(localAvailable)), units.BytesSize(float64(rc.cfg.TikvImporter.DiskQuota)))
			passed = true
			log.L().Warn(message)
		}
	}
	rc.checkTemplate.Collect(Critical, passed, message)
	return nil
}

// CheckpointIsValid checks whether we can start this import with this checkpoint.
func (rc *Controller) CheckpointIsValid(ctx context.Context, tableInfo *mydump.MDTableMeta) ([]string, bool, error) {
	msgs := make([]string, 0)
	uniqueName := common.UniqueTable(tableInfo.DB, tableInfo.Name)
	tableCheckPoint, err := rc.checkpointsDB.Get(ctx, uniqueName)
	if err != nil {
		// there is no checkpoint
		log.L().Debug("no checkpoint detected", zap.String("table", uniqueName))
		return nil, true, nil
	}
	// if checkpoint is enabled and not missing, we skip the check-table-empty progress.
	if tableCheckPoint.Status <= checkpoints.CheckpointStatusMissing {
		return nil, false, nil
	}

	var permFromCheckpoint []int
	var columns []string
	for _, eng := range tableCheckPoint.Engines {
		if len(eng.Chunks) > 0 {
			chunk := eng.Chunks[0]
			permFromCheckpoint = chunk.ColumnPermutation
			columns = chunk.Chunk.Columns
			if filepath.Dir(chunk.FileMeta.Path) != rc.cfg.Mydumper.SourceDir {
				message := fmt.Sprintf("chunk checkpoint path is not equal to config, "+
					"checkpoint is %s, config source dir is %s", chunk.FileMeta.Path, rc.cfg.Mydumper.SourceDir)
				msgs = append(msgs, message)
			}
		}
	}
	if len(columns) == 0 {
		log.L().Debug("no valid checkpoint detected", zap.String("table", uniqueName))
		return nil, false, nil
	}
	info := rc.dbInfos[tableInfo.DB].Tables[tableInfo.Name]
	if info != nil {
		permFromTiDB, err := parseColumnPermutations(info.Core, columns, nil)
		if err != nil {
			msgs = append(msgs, fmt.Sprintf("failed to calculate columns %s, table %s's info has changed, "+
				"consider removing this checkpoint and starting the import again.", err.Error(), uniqueName))
		}
		if !reflect.DeepEqual(permFromCheckpoint, permFromTiDB) {
			msgs = append(msgs, fmt.Sprintf("compare columns perm failed. table %s's info has changed, "+
				"consider removing this checkpoint and starting the import again.", uniqueName))
		}
	}
	return msgs, false, nil
}

// hasDefault reports whether col has a default value.
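// A column is treated as having a default when it has an explicit default value or
// default expression, is nullable, is generated, or carries the auto-increment flag.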
func hasDefault(col *model.ColumnInfo) bool {
	return col.DefaultIsExpr || col.DefaultValue != nil || !mysql.HasNotNullFlag(col.Flag) ||
		col.IsGenerated() || mysql.HasAutoIncrementFlag(col.Flag)
}

// readColumnsAndCount reads the first row of the data file and returns the column
// names declared by the file (if any) and the number of columns in that row.
func (rc *Controller) readColumnsAndCount(ctx context.Context, dataFileMeta mydump.SourceFileMeta) (cols []string, colCnt int, err error) {
	var reader storage.ReadSeekCloser
	if dataFileMeta.Type == mydump.SourceTypeParquet {
		reader, err = mydump.OpenParquetReader(ctx, rc.store, dataFileMeta.Path, dataFileMeta.FileSize)
	} else {
		reader, err = rc.store.Open(ctx, dataFileMeta.Path)
	}
	if err != nil {
		return nil, 0, errors.Trace(err)
	}

	var parser mydump.Parser
	blockBufSize := int64(rc.cfg.Mydumper.ReadBlockSize)
	switch dataFileMeta.Type {
	case mydump.SourceTypeCSV:
		hasHeader := rc.cfg.Mydumper.CSV.Header
		parser = mydump.NewCSVParser(&rc.cfg.Mydumper.CSV, reader, blockBufSize, rc.ioWorkers, hasHeader)
	case mydump.SourceTypeSQL:
		parser = mydump.NewChunkParser(rc.cfg.TiDB.SQLMode, reader, blockBufSize, rc.ioWorkers)
	case mydump.SourceTypeParquet:
		parser, err = mydump.NewParquetParser(ctx, rc.store, reader, dataFileMeta.Path)
		if err != nil {
			return nil, 0, errors.Trace(err)
		}
	default:
		panic(fmt.Sprintf("unknown file type '%s'", dataFileMeta.Type))
	}
	defer parser.Close()

	err = parser.ReadRow()
	if err != nil && errors.Cause(err) != io.EOF {
		return nil, 0, errors.Trace(err)
	}
	return parser.Columns(), len(parser.LastRow().Row), nil
}

// SchemaIsValid checks whether the import files and the cluster schema match.
func (rc *Controller) SchemaIsValid(ctx context.Context, tableInfo *mydump.MDTableMeta) ([]string, error) {
	msgs := make([]string, 0)
	info, ok := rc.dbInfos[tableInfo.DB].Tables[tableInfo.Name]
	if !ok {
		msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s` doesn't exist, "+
			"please give a schema file in the source dir or create the table manually", tableInfo.DB, tableInfo.Name))
		return msgs, nil
	}

	igCols := make(map[string]struct{})
	igCol, err := rc.cfg.Mydumper.IgnoreColumns.GetIgnoreColumns(tableInfo.DB, tableInfo.Name, rc.cfg.Mydumper.CaseSensitive)
	if err != nil {
		return nil, errors.Trace(err)
	}
	for _, col := range igCol.Columns {
		igCols[col] = struct{}{}
	}

	if len(tableInfo.DataFiles) == 0 {
		log.L().Info("no data files detected", zap.String("db", tableInfo.DB), zap.String("table", tableInfo.Name))
		return nil, nil
	}

	colCountFromTiDB := len(info.Core.Columns)
	core := info.Core
	defaultCols := make(map[string]struct{})
	for _, col := range core.Columns {
		if hasDefault(col) || (info.Core.ContainsAutoRandomBits() && mysql.HasPriKeyFlag(col.Flag)) {
			// this column has a default value or is the auto random id, so we can ignore it
			defaultCols[col.Name.L] = struct{}{}
		}
	}
	// tidb_rowid has a default value.
	defaultCols[model.ExtraHandleName.String()] = struct{}{}

	for _, dataFile := range tableInfo.DataFiles {
		// get column names from the data file.
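		// The column list is read from the file itself (e.g. a CSV header); when the
		// file declares no column names, columns are matched by position instead.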
		dataFileMeta := dataFile.FileMeta

		if tp := dataFileMeta.Type; tp != mydump.SourceTypeCSV && tp != mydump.SourceTypeSQL && tp != mydump.SourceTypeParquet {
			msgs = append(msgs, fmt.Sprintf("file '%s' with unknown source type '%s'", dataFileMeta.Path, dataFileMeta.Type.String()))
			return msgs, nil
		}
		colsFromDataFile, colCountFromDataFile, err := rc.readColumnsAndCount(ctx, dataFileMeta)
		if err != nil {
			return nil, errors.Trace(err)
		}
		if colsFromDataFile == nil && colCountFromDataFile == 0 {
			log.L().Info("file contains no data, skip checking against schema validity", zap.String("path", dataFileMeta.Path))
			continue
		}

		if colsFromDataFile == nil {
			// when there are no column names in the data file, we must insert data in order,
			// so the trailing TiDB columns either can be ignored or must have a default value.
			for i := colCountFromDataFile; i < colCountFromTiDB; i++ {
				if _, ok := defaultCols[core.Columns[i].Name.L]; !ok {
					msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s` has %d columns, "+
						"and the data file has %d columns, but column %s is missing a default value, "+
						"please give the column a default value to skip this check",
						tableInfo.DB, tableInfo.Name, colCountFromTiDB, colCountFromDataFile, core.Columns[i].Name.L))
				}
			}
		} else {
			// compare column names and make sure
			// 1. the TiDB table info has all columns of the data file (besides ignored columns)
			// 2. the columns not present in the data file always have a default value.
			colMap := make(map[string]struct{})
			for col := range igCols {
				colMap[col] = struct{}{}
			}
			for _, col := range core.Columns {
				if _, ok := colMap[col.Name.L]; ok {
					// tidb's column is ignored
					// we need to ensure this column has a default value.
					if _, hasDefault := defaultCols[col.Name.L]; !hasDefault {
						msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s`'s column %s cannot be ignored, "+
							"because it doesn't have a default value, please set tables.ignoreColumns properly",
							tableInfo.DB, tableInfo.Name, col.Name.L))
					}
				} else {
					colMap[col.Name.L] = struct{}{}
				}
			}
			// tidb_rowid can be ignored in this check
			colMap[model.ExtraHandleName.String()] = struct{}{}
			for _, col := range colsFromDataFile {
				if _, ok := colMap[col]; !ok {
					checkMsg := "please check table schema"
					if dataFileMeta.Type == mydump.SourceTypeCSV && rc.cfg.Mydumper.CSV.Header {
						checkMsg += " and csv file header"
					}
					msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s` doesn't have column %s, "+
						"%s or use tables.ignoreColumns to ignore %s",
						tableInfo.DB, tableInfo.Name, col, checkMsg, col))
				} else {
					// remove the column for the next iteration
					delete(colMap, col)
				}
			}
			// report the remaining columns that don't have a default value.
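			// colMap now holds the columns that the data file does not provide;
			// each of them needs a default value.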
			for col := range colMap {
				if _, ok := defaultCols[col]; ok {
					continue
				}
				msgs = append(msgs, fmt.Sprintf("TiDB schema `%s`.`%s` doesn't have the default value for %s, "+
					"please give a default value for %s or choose another column to ignore or add this column in the data file",
					tableInfo.DB, tableInfo.Name, col, col))
			}
		}
		if len(msgs) > 0 {
			return msgs, nil
		}
	}
	return msgs, nil
}

// SampleDataFromTable samples the first data file of the table and fills
// tableMeta.IndexRatio and tableMeta.IsRowOrdered.
func (rc *Controller) SampleDataFromTable(ctx context.Context, dbName string, tableMeta *mydump.MDTableMeta, tableInfo *model.TableInfo) error {
	if len(tableMeta.DataFiles) == 0 {
		return nil
	}
	sampleFile := tableMeta.DataFiles[0].FileMeta
	var reader storage.ReadSeekCloser
	var err error
	if sampleFile.Type == mydump.SourceTypeParquet {
		reader, err = mydump.OpenParquetReader(ctx, rc.store, sampleFile.Path, sampleFile.FileSize)
	} else {
		reader, err = rc.store.Open(ctx, sampleFile.Path)
	}
	if err != nil {
		return errors.Trace(err)
	}
	idAlloc := kv.NewPanickingAllocators(0)
	tbl, err := tables.TableFromMeta(idAlloc, tableInfo)
	if err != nil {
		return errors.Trace(err)
	}

	kvEncoder, err := rc.backend.NewEncoder(tbl, &kv.SessionOptions{
		SQLMode:        rc.cfg.TiDB.SQLMode,
		Timestamp:      0,
		SysVars:        rc.sysVars,
		AutoRandomSeed: 0,
	})
	if err != nil {
		return errors.Trace(err)
	}
	blockBufSize := int64(rc.cfg.Mydumper.ReadBlockSize)

	var parser mydump.Parser
	switch tableMeta.DataFiles[0].FileMeta.Type {
	case mydump.SourceTypeCSV:
		hasHeader := rc.cfg.Mydumper.CSV.Header
		parser = mydump.NewCSVParser(&rc.cfg.Mydumper.CSV, reader, blockBufSize, rc.ioWorkers, hasHeader)
	case mydump.SourceTypeSQL:
		parser = mydump.NewChunkParser(rc.cfg.TiDB.SQLMode, reader, blockBufSize, rc.ioWorkers)
	case mydump.SourceTypeParquet:
		parser, err = mydump.NewParquetParser(ctx, rc.store, reader, sampleFile.Path)
		if err != nil {
			return errors.Trace(err)
		}
	default:
		panic(fmt.Sprintf("file '%s' with unknown source type '%s'", sampleFile.Path, sampleFile.Type.String()))
	}
	defer parser.Close()
	logTask := log.With(zap.String("table", tableMeta.Name)).Begin(zap.InfoLevel, "sample file")
	igCols, err := rc.cfg.Mydumper.IgnoreColumns.GetIgnoreColumns(dbName, tableMeta.Name, rc.cfg.Mydumper.CaseSensitive)
	if err != nil {
		return errors.Trace(err)
	}

	initializedColumns, reachEOF := false, false
	var columnPermutation []int
	var kvSize uint64 = 0
	var rowSize uint64 = 0
	rowCount := 0
	dataKVs := rc.backend.MakeEmptyRows()
	indexKVs := rc.backend.MakeEmptyRows()
	lastKey := make([]byte, 0)
	tableMeta.IsRowOrdered = true
	tableMeta.IndexRatio = 1.0
outloop:
	for !reachEOF {
		offset, _ := parser.Pos()
		err = parser.ReadRow()
		columnNames := parser.Columns()

		switch errors.Cause(err) {
		case nil:
			if !initializedColumns {
				if len(columnPermutation) == 0 {
					columnPermutation, err = createColumnPermutation(columnNames, igCols.Columns, tableInfo)
					if err != nil {
						return errors.Trace(err)
					}
				}
				initializedColumns = true
			}
		case io.EOF:
			reachEOF = true
			break outloop
		default:
			err = errors.Annotatef(err, "in file offset %d", offset)
			return errors.Trace(err)
		}
		lastRow := parser.LastRow()
		rowSize += uint64(lastRow.Length)
		rowCount += 1

		var dataChecksum, indexChecksum verification.KVChecksum
		kvs, encodeErr := kvEncoder.Encode(logTask.Logger, lastRow.Row, lastRow.RowID, columnPermutation, offset)
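		// The parsed row is recycled right after encoding; the KV pairs produced by
		// Encode are assumed not to reference it, so the sampling below stays valid.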
		parser.RecycleRow(lastRow)
		if encodeErr != nil {
			err = errors.Annotatef(encodeErr, "in file at offset %d", offset)
			return errors.Trace(err)
		}
		if tableMeta.IsRowOrdered {
			kvs.ClassifyAndAppend(&dataKVs, &dataChecksum, &indexKVs, &indexChecksum)
			for _, kv := range kv.KvPairsFromRows(dataKVs) {
				if len(lastKey) == 0 {
					lastKey = kv.Key
				} else if bytes.Compare(lastKey, kv.Key) > 0 {
					tableMeta.IsRowOrdered = false
					break
				}
			}
			dataKVs = dataKVs.Clear()
			indexKVs = indexKVs.Clear()
		}
		kvSize += kvs.Size()

		failpoint.Inject("mock-kv-size", func(val failpoint.Value) {
			kvSize += uint64(val.(int))
		})
		if rowSize > maxSampleDataSize && rowCount > maxSampleRowCount {
			break
		}
	}

	if rowSize > 0 && kvSize > rowSize {
		tableMeta.IndexRatio = float64(kvSize) / float64(rowSize)
	}
	log.L().Info("Sample source data", zap.String("table", tableMeta.Name), zap.Float64("IndexRatio", tableMeta.IndexRatio), zap.Bool("IsSourceOrder", tableMeta.IsRowOrdered))
	return nil
}