// Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.

package task

import (
	"context"
	"fmt"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/docker/go-units"
	"github.com/opentracing/opentracing-go"
	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	backuppb "github.com/pingcap/kvproto/pkg/backup"
	"github.com/pingcap/log"
	"github.com/pingcap/parser/mysql"
	"github.com/pingcap/tidb/sessionctx/stmtctx"
	"github.com/pingcap/tidb/statistics/handle"
	"github.com/pingcap/tidb/types"
	"github.com/spf13/pflag"
	"github.com/tikv/client-go/v2/oracle"
	"go.uber.org/zap"

	"github.com/pingcap/br/pkg/backup"
	"github.com/pingcap/br/pkg/checksum"
	berrors "github.com/pingcap/br/pkg/errors"
	"github.com/pingcap/br/pkg/glue"
	"github.com/pingcap/br/pkg/logutil"
	"github.com/pingcap/br/pkg/metautil"
	"github.com/pingcap/br/pkg/storage"
	"github.com/pingcap/br/pkg/summary"
	"github.com/pingcap/br/pkg/utils"
)

// Command-line flag names shared by the backup subcommands.
const (
	flagBackupTimeago    = "timeago"
	flagBackupTS         = "backupts"
	flagLastBackupTS     = "lastbackupts"
	flagCompressionType  = "compression"
	flagCompressionLevel = "compression-level"
	flagRemoveSchedulers = "remove-schedulers"
	flagIgnoreStats      = "ignore-stats"
	flagUseBackupMetaV2  = "use-backupmeta-v2"

	flagGCTTL = "gcttl"

	// defaultBackupConcurrency is applied when the user leaves --concurrency unset (0).
	defaultBackupConcurrency = 4
	// maxBackupConcurrency caps any user-supplied --concurrency value.
	maxBackupConcurrency = 256
)

// CompressionConfig is the configuration for sst file compression.
type CompressionConfig struct {
	// CompressionType selects the sst compression algorithm (lz4/snappy/zstd).
	CompressionType backuppb.CompressionType `json:"compression-type" toml:"compression-type"`
	// CompressionLevel is the algorithm-specific compression level (0 means default).
	CompressionLevel int32 `json:"compression-level" toml:"compression-level"`
}

// BackupConfig is the configuration specific for backup tasks.
61 type BackupConfig struct { 62 Config 63 64 TimeAgo time.Duration `json:"time-ago" toml:"time-ago"` 65 BackupTS uint64 `json:"backup-ts" toml:"backup-ts"` 66 LastBackupTS uint64 `json:"last-backup-ts" toml:"last-backup-ts"` 67 GCTTL int64 `json:"gc-ttl" toml:"gc-ttl"` 68 RemoveSchedulers bool `json:"remove-schedulers" toml:"remove-schedulers"` 69 IgnoreStats bool `json:"ignore-stats" toml:"ignore-stats"` 70 UseBackupMetaV2 bool `json:"use-backupmeta-v2"` 71 CompressionConfig 72 } 73 74 // DefineBackupFlags defines common flags for the backup command. 75 func DefineBackupFlags(flags *pflag.FlagSet) { 76 flags.Duration( 77 flagBackupTimeago, 0, 78 "The history version of the backup task, e.g. 1m, 1h. Do not exceed GCSafePoint") 79 80 // TODO: remove experimental tag if it's stable 81 flags.Uint64(flagLastBackupTS, 0, "(experimental) the last time backup ts,"+ 82 " use for incremental backup, support TSO only") 83 flags.String(flagBackupTS, "", "the backup ts support TSO or datetime,"+ 84 " e.g. '400036290571534337', '2018-05-11 01:42:23'") 85 flags.Int64(flagGCTTL, utils.DefaultBRGCSafePointTTL, "the TTL (in seconds) that PD holds for BR's GC safepoint") 86 flags.String(flagCompressionType, "zstd", 87 "backup sst file compression algorithm, value can be one of 'lz4|zstd|snappy'") 88 flags.Int32(flagCompressionLevel, 0, "compression level used for sst file compression") 89 90 flags.Bool(flagRemoveSchedulers, false, 91 "disable the balance, shuffle and region-merge schedulers in PD to speed up backup") 92 // This flag can impact the online cluster, so hide it in case of abuse. 93 _ = flags.MarkHidden(flagRemoveSchedulers) 94 95 // Disable stats by default. because of 96 // 1. DumpStatsToJson is not stable 97 // 2. It increases memory usage and might cause BR OOM. 98 // TODO: we need a better way to backup/restore stats. 99 flags.Bool(flagIgnoreStats, true, "ignore backup stats, used for test") 100 // This flag is used for test. we should backup stats all the time. 
101 _ = flags.MarkHidden(flagIgnoreStats) 102 103 flags.Bool(flagUseBackupMetaV2, false, 104 "use backup meta v2 to store meta info") 105 // This flag will change the structure of backupmeta. 106 // we must make sure the old three version of br can parse the v2 meta to keep compatibility. 107 // so this flag should set to false for three version by default. 108 // for example: 109 // if we put this feature in v4.0.14, then v4.0.14 br can parse v2 meta 110 // but will generate v1 meta due to this flag is false. the behaviour is as same as v4.0.15, v4.0.16. 111 // finally v4.0.17 will set this flag to true, and generate v2 meta. 112 _ = flags.MarkHidden(flagUseBackupMetaV2) 113 } 114 115 // ParseFromFlags parses the backup-related flags from the flag set. 116 func (cfg *BackupConfig) ParseFromFlags(flags *pflag.FlagSet) error { 117 timeAgo, err := flags.GetDuration(flagBackupTimeago) 118 if err != nil { 119 return errors.Trace(err) 120 } 121 if timeAgo < 0 { 122 return errors.Annotate(berrors.ErrInvalidArgument, "negative timeago is not allowed") 123 } 124 cfg.TimeAgo = timeAgo 125 cfg.LastBackupTS, err = flags.GetUint64(flagLastBackupTS) 126 if err != nil { 127 return errors.Trace(err) 128 } 129 backupTS, err := flags.GetString(flagBackupTS) 130 if err != nil { 131 return errors.Trace(err) 132 } 133 cfg.BackupTS, err = parseTSString(backupTS) 134 if err != nil { 135 return errors.Trace(err) 136 } 137 gcTTL, err := flags.GetInt64(flagGCTTL) 138 if err != nil { 139 return errors.Trace(err) 140 } 141 cfg.GCTTL = gcTTL 142 143 compressionCfg, err := parseCompressionFlags(flags) 144 if err != nil { 145 return errors.Trace(err) 146 } 147 cfg.CompressionConfig = *compressionCfg 148 149 if err = cfg.Config.ParseFromFlags(flags); err != nil { 150 return errors.Trace(err) 151 } 152 cfg.RemoveSchedulers, err = flags.GetBool(flagRemoveSchedulers) 153 if err != nil { 154 return errors.Trace(err) 155 } 156 cfg.IgnoreStats, err = flags.GetBool(flagIgnoreStats) 157 if err != nil { 
158 return errors.Trace(err) 159 } 160 cfg.UseBackupMetaV2, err = flags.GetBool(flagUseBackupMetaV2) 161 return errors.Trace(err) 162 } 163 164 // ParseFromFlags parses the backup-related flags from the flag set. 165 func parseCompressionFlags(flags *pflag.FlagSet) (*CompressionConfig, error) { 166 compressionStr, err := flags.GetString(flagCompressionType) 167 if err != nil { 168 return nil, errors.Trace(err) 169 } 170 compressionType, err := parseCompressionType(compressionStr) 171 if err != nil { 172 return nil, errors.Trace(err) 173 } 174 level, err := flags.GetInt32(flagCompressionLevel) 175 if err != nil { 176 return nil, errors.Trace(err) 177 } 178 return &CompressionConfig{ 179 CompressionLevel: level, 180 CompressionType: compressionType, 181 }, nil 182 } 183 184 // adjustBackupConfig is use for BR(binary) and BR in TiDB. 185 // When new config was add and not included in parser. 186 // we should set proper value in this function. 187 // so that both binary and TiDB will use same default value. 188 func (cfg *BackupConfig) adjustBackupConfig() { 189 cfg.adjust() 190 usingDefaultConcurrency := false 191 if cfg.Config.Concurrency == 0 { 192 cfg.Config.Concurrency = defaultBackupConcurrency 193 usingDefaultConcurrency = true 194 } 195 if cfg.Config.Concurrency > maxBackupConcurrency { 196 cfg.Config.Concurrency = maxBackupConcurrency 197 } 198 if cfg.RateLimit != unlimited { 199 // TiKV limits the upload rate by each backup request. 200 // When the backup requests are sent concurrently, 201 // the ratelimit couldn't work as intended. 202 // Degenerating to sequentially sending backup requests to avoid this. 203 if !usingDefaultConcurrency { 204 logutil.WarnTerm("setting `--ratelimit` and `--concurrency` at the same time, "+ 205 "ignoring `--concurrency`: `--ratelimit` forces sequential (i.e. 
concurrency = 1) backup", 206 zap.String("ratelimit", units.HumanSize(float64(cfg.RateLimit))+"/s"), 207 zap.Uint32("concurrency-specified", cfg.Config.Concurrency)) 208 } 209 cfg.Config.Concurrency = 1 210 } 211 212 if cfg.GCTTL == 0 { 213 cfg.GCTTL = utils.DefaultBRGCSafePointTTL 214 } 215 // Use zstd as default 216 if cfg.CompressionType == backuppb.CompressionType_UNKNOWN { 217 cfg.CompressionType = backuppb.CompressionType_ZSTD 218 } 219 } 220 221 // RunBackup starts a backup task inside the current goroutine. 222 func RunBackup(c context.Context, g glue.Glue, cmdName string, cfg *BackupConfig) error { 223 cfg.adjustBackupConfig() 224 225 defer summary.Summary(cmdName) 226 ctx, cancel := context.WithCancel(c) 227 defer cancel() 228 229 if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil { 230 span1 := span.Tracer().StartSpan("task.RunBackup", opentracing.ChildOf(span.Context())) 231 defer span1.Finish() 232 ctx = opentracing.ContextWithSpan(ctx, span1) 233 } 234 235 u, err := storage.ParseBackend(cfg.Storage, &cfg.BackendOptions) 236 if err != nil { 237 return errors.Trace(err) 238 } 239 skipStats := cfg.IgnoreStats 240 // For backup, Domain is not needed if user ignores stats. 241 // Domain loads all table info into memory. By skipping Domain, we save 242 // lots of memory (about 500MB for 40K 40 fields YCSB tables). 
243 needDomain := !skipStats 244 mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, GetKeepalive(&cfg.Config), cfg.CheckRequirements, needDomain) 245 if err != nil { 246 return errors.Trace(err) 247 } 248 defer mgr.Close() 249 var statsHandle *handle.Handle 250 if !skipStats { 251 statsHandle = mgr.GetDomain().StatsHandle() 252 } 253 254 client, err := backup.NewBackupClient(ctx, mgr) 255 if err != nil { 256 return errors.Trace(err) 257 } 258 opts := storage.ExternalStorageOptions{ 259 NoCredentials: cfg.NoCreds, 260 SendCredentials: cfg.SendCreds, 261 SkipCheckPath: cfg.SkipCheckPath, 262 } 263 if err = client.SetStorage(ctx, u, &opts); err != nil { 264 return errors.Trace(err) 265 } 266 err = client.SetLockFile(ctx) 267 if err != nil { 268 return errors.Trace(err) 269 } 270 client.SetGCTTL(cfg.GCTTL) 271 272 backupTS, err := client.GetTS(ctx, cfg.TimeAgo, cfg.BackupTS) 273 if err != nil { 274 return errors.Trace(err) 275 } 276 g.Record("BackupTS", backupTS) 277 sp := utils.BRServiceSafePoint{ 278 BackupTS: backupTS, 279 TTL: client.GetGCTTL(), 280 ID: utils.MakeSafePointID(), 281 } 282 // use lastBackupTS as safePoint if exists 283 if cfg.LastBackupTS > 0 { 284 sp.BackupTS = cfg.LastBackupTS 285 } 286 287 log.Info("current backup safePoint job", zap.Object("safePoint", sp)) 288 err = utils.StartServiceSafePointKeeper(ctx, mgr.GetPDClient(), sp) 289 if err != nil { 290 return errors.Trace(err) 291 } 292 293 isIncrementalBackup := cfg.LastBackupTS > 0 294 295 if cfg.RemoveSchedulers { 296 log.Debug("removing some PD schedulers") 297 restore, e := mgr.RemoveSchedulers(ctx) 298 defer func() { 299 if ctx.Err() != nil { 300 log.Warn("context canceled, doing clean work with background context") 301 ctx = context.Background() 302 } 303 if restoreE := restore(ctx); restoreE != nil { 304 log.Warn("failed to restore removed schedulers, you may need to restore them manually", zap.Error(restoreE)) 305 } 306 }() 307 if e != nil { 308 return errors.Trace(err) 309 } 310 } 311 312 req 
:= backuppb.BackupRequest{ 313 ClusterId: client.GetClusterID(), 314 StartVersion: cfg.LastBackupTS, 315 EndVersion: backupTS, 316 RateLimit: cfg.RateLimit, 317 Concurrency: defaultBackupConcurrency, 318 CompressionType: cfg.CompressionType, 319 CompressionLevel: cfg.CompressionLevel, 320 } 321 brVersion := g.GetVersion() 322 clusterVersion, err := mgr.GetClusterVersion(ctx) 323 if err != nil { 324 return errors.Trace(err) 325 } 326 327 ranges, schemas, err := backup.BuildBackupRangeAndSchema(mgr.GetStorage(), cfg.TableFilter, backupTS) 328 if err != nil { 329 return errors.Trace(err) 330 } 331 332 // Metafile size should be less than 64MB. 333 metawriter := metautil.NewMetaWriter(client.GetStorage(), metautil.MetaFileSize, cfg.UseBackupMetaV2) 334 335 // nothing to backup 336 if ranges == nil { 337 // Hack way to update backupmeta. 338 metawriter.StartWriteMetasAsync(ctx, metautil.AppendSchema) 339 metawriter.Update(func(m *backuppb.BackupMeta) { 340 m.StartVersion = req.StartVersion 341 m.EndVersion = req.EndVersion 342 m.IsRawKv = req.IsRawKv 343 m.ClusterId = req.ClusterId 344 m.ClusterVersion = clusterVersion 345 m.BrVersion = brVersion 346 }) 347 pdAddress := strings.Join(cfg.PD, ",") 348 log.Warn("Nothing to backup, maybe connected to cluster for restoring", 349 zap.String("PD address", pdAddress)) 350 return metawriter.FinishWriteMetas(ctx, metautil.AppendSchema) 351 } 352 353 if isIncrementalBackup { 354 if backupTS <= cfg.LastBackupTS { 355 log.Error("LastBackupTS is larger or equal to current TS") 356 return errors.Annotate(berrors.ErrInvalidArgument, "LastBackupTS is larger or equal to current TS") 357 } 358 err = utils.CheckGCSafePoint(ctx, mgr.GetPDClient(), cfg.LastBackupTS) 359 if err != nil { 360 log.Error("Check gc safepoint for last backup ts failed", zap.Error(err)) 361 return errors.Trace(err) 362 } 363 364 metawriter.StartWriteMetasAsync(ctx, metautil.AppendDDL) 365 err = backup.WriteBackupDDLJobs(metawriter, mgr.GetStorage(), 
cfg.LastBackupTS, backupTS) 366 if err != nil { 367 return errors.Trace(err) 368 } 369 if err = metawriter.FinishWriteMetas(ctx, metautil.AppendDDL); err != nil { 370 return errors.Trace(err) 371 } 372 } 373 374 summary.CollectInt("backup total ranges", len(ranges)) 375 376 var updateCh glue.Progress 377 var unit backup.ProgressUnit 378 if len(ranges) < 100 { 379 unit = backup.RegionUnit 380 // The number of regions need to backup 381 approximateRegions := 0 382 for _, r := range ranges { 383 var regionCount int 384 regionCount, err = mgr.GetRegionCount(ctx, r.StartKey, r.EndKey) 385 if err != nil { 386 return errors.Trace(err) 387 } 388 approximateRegions += regionCount 389 } 390 // Redirect to log if there is no log file to avoid unreadable output. 391 updateCh = g.StartProgress( 392 ctx, cmdName, int64(approximateRegions), !cfg.LogProgress) 393 summary.CollectInt("backup total regions", approximateRegions) 394 } else { 395 unit = backup.RangeUnit 396 // To reduce the costs, we can use the range as unit of progress. 
397 updateCh = g.StartProgress( 398 ctx, cmdName, int64(len(ranges)), !cfg.LogProgress) 399 } 400 401 progressCount := 0 402 progressCallBack := func(callBackUnit backup.ProgressUnit) { 403 if unit == callBackUnit { 404 updateCh.Inc() 405 progressCount++ 406 failpoint.Inject("progress-call-back", func(v failpoint.Value) { 407 log.Info("failpoint progress-call-back injected") 408 if fileName, ok := v.(string); ok { 409 f, osErr := os.OpenFile(fileName, os.O_CREATE|os.O_WRONLY, os.ModePerm) 410 if osErr != nil { 411 log.Warn("failed to create file", zap.Error(osErr)) 412 } 413 msg := []byte(fmt.Sprintf("%s:%d\n", unit, progressCount)) 414 _, err = f.Write(msg) 415 if err != nil { 416 log.Warn("failed to write data to file", zap.Error(err)) 417 } 418 } 419 }) 420 } 421 } 422 metawriter.StartWriteMetasAsync(ctx, metautil.AppendDataFile) 423 err = client.BackupRanges(ctx, ranges, req, uint(cfg.Concurrency), metawriter, progressCallBack) 424 if err != nil { 425 return errors.Trace(err) 426 } 427 // Backup has finished 428 updateCh.Close() 429 430 err = metawriter.FinishWriteMetas(ctx, metautil.AppendDataFile) 431 if err != nil { 432 return errors.Trace(err) 433 } 434 435 metawriter.Update(func(m *backuppb.BackupMeta) { 436 m.StartVersion = req.StartVersion 437 m.EndVersion = req.EndVersion 438 m.IsRawKv = req.IsRawKv 439 m.ClusterId = req.ClusterId 440 m.ClusterVersion = clusterVersion 441 m.BrVersion = brVersion 442 }) 443 444 skipChecksum := !cfg.Checksum || isIncrementalBackup 445 checksumProgress := int64(schemas.Len()) 446 if skipChecksum { 447 checksumProgress = 1 448 if isIncrementalBackup { 449 // Since we don't support checksum for incremental data, fast checksum should be skipped. 450 log.Info("Skip fast checksum in incremental backup") 451 } else { 452 // When user specified not to calculate checksum, don't calculate checksum. 
453 log.Info("Skip fast checksum") 454 } 455 } 456 updateCh = g.StartProgress(ctx, "Checksum", checksumProgress, !cfg.LogProgress) 457 schemasConcurrency := uint(utils.MinInt(backup.DefaultSchemaConcurrency, schemas.Len())) 458 459 err = schemas.BackupSchemas( 460 ctx, metawriter, mgr.GetStorage(), statsHandle, backupTS, schemasConcurrency, cfg.ChecksumConcurrency, skipChecksum, updateCh) 461 if err != nil { 462 return errors.Trace(err) 463 } 464 // Checksum has finished, close checksum progress. 465 updateCh.Close() 466 467 if !skipChecksum { 468 // Check if checksum from files matches checksum from coprocessor. 469 err = checksum.FastChecksum(ctx, metawriter.Backupmeta(), client.GetStorage()) 470 if err != nil { 471 return errors.Trace(err) 472 } 473 } 474 475 g.Record(summary.BackupDataSize, metawriter.ArchiveSize()) 476 failpoint.Inject("s3-outage-during-writing-file", func(v failpoint.Value) { 477 log.Info("failpoint s3-outage-during-writing-file injected, " + 478 "process will sleep for 3s and notify the shell to kill s3 service.") 479 if sigFile, ok := v.(string); ok { 480 file, err := os.Create(sigFile) 481 if err != nil { 482 log.Warn("failed to create file for notifying, skipping notify", zap.Error(err)) 483 } 484 if file != nil { 485 file.Close() 486 } 487 } 488 time.Sleep(3 * time.Second) 489 }) 490 // Set task summary to success status. 491 summary.SetSuccessStatus(true) 492 return nil 493 } 494 495 // parseTSString port from tidb setSnapshotTS. 
496 func parseTSString(ts string) (uint64, error) { 497 if len(ts) == 0 { 498 return 0, nil 499 } 500 if tso, err := strconv.ParseUint(ts, 10, 64); err == nil { 501 return tso, nil 502 } 503 504 loc := time.Local 505 sc := &stmtctx.StatementContext{ 506 TimeZone: loc, 507 } 508 t, err := types.ParseTime(sc, ts, mysql.TypeTimestamp, types.MaxFsp) 509 if err != nil { 510 return 0, errors.Trace(err) 511 } 512 t1, err := t.GoTime(loc) 513 if err != nil { 514 return 0, errors.Trace(err) 515 } 516 return oracle.GoTimeToTS(t1), nil 517 } 518 519 func parseCompressionType(s string) (backuppb.CompressionType, error) { 520 var ct backuppb.CompressionType 521 switch s { 522 case "lz4": 523 ct = backuppb.CompressionType_LZ4 524 case "snappy": 525 ct = backuppb.CompressionType_SNAPPY 526 case "zstd": 527 ct = backuppb.CompressionType_ZSTD 528 default: 529 return backuppb.CompressionType_UNKNOWN, errors.Annotatef(berrors.ErrInvalidArgument, "invalid compression type '%s'", s) 530 } 531 return ct, nil 532 }