github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/task/backup.go (about)

     1  // Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.
     2  
     3  package task
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"os"
     9  	"strconv"
    10  	"strings"
    11  	"time"
    12  
    13  	"github.com/docker/go-units"
    14  	"github.com/opentracing/opentracing-go"
    15  	"github.com/pingcap/errors"
    16  	"github.com/pingcap/failpoint"
    17  	backuppb "github.com/pingcap/kvproto/pkg/backup"
    18  	"github.com/pingcap/log"
    19  	"github.com/pingcap/parser/mysql"
    20  	"github.com/pingcap/tidb/sessionctx/stmtctx"
    21  	"github.com/pingcap/tidb/statistics/handle"
    22  	"github.com/pingcap/tidb/types"
    23  	"github.com/spf13/pflag"
    24  	"github.com/tikv/client-go/v2/oracle"
    25  	"go.uber.org/zap"
    26  
    27  	"github.com/pingcap/br/pkg/backup"
    28  	"github.com/pingcap/br/pkg/checksum"
    29  	berrors "github.com/pingcap/br/pkg/errors"
    30  	"github.com/pingcap/br/pkg/glue"
    31  	"github.com/pingcap/br/pkg/logutil"
    32  	"github.com/pingcap/br/pkg/metautil"
    33  	"github.com/pingcap/br/pkg/storage"
    34  	"github.com/pingcap/br/pkg/summary"
    35  	"github.com/pingcap/br/pkg/utils"
    36  )
    37  
// Flag names and concurrency bounds used by the backup command.
const (
	flagBackupTimeago    = "timeago"
	flagBackupTS         = "backupts"
	flagLastBackupTS     = "lastbackupts"
	flagCompressionType  = "compression"
	flagCompressionLevel = "compression-level"
	flagRemoveSchedulers = "remove-schedulers"
	flagIgnoreStats      = "ignore-stats"
	flagUseBackupMetaV2  = "use-backupmeta-v2"

	// flagGCTTL is the TTL (in seconds) that PD holds BR's GC safepoint.
	flagGCTTL = "gcttl"

	// defaultBackupConcurrency is applied when --concurrency is 0.
	defaultBackupConcurrency = 4
	// maxBackupConcurrency caps a user-specified --concurrency.
	maxBackupConcurrency     = 256
)
    53  
// CompressionConfig is the configuration for sst file compression.
type CompressionConfig struct {
	// CompressionType selects the sst compression algorithm (lz4/zstd/snappy).
	CompressionType  backuppb.CompressionType `json:"compression-type" toml:"compression-type"`
	// CompressionLevel is the algorithm-specific level; 0 means the default.
	CompressionLevel int32                    `json:"compression-level" toml:"compression-level"`
}
    59  
    60  // BackupConfig is the configuration specific for backup tasks.
    61  type BackupConfig struct {
    62  	Config
    63  
    64  	TimeAgo          time.Duration `json:"time-ago" toml:"time-ago"`
    65  	BackupTS         uint64        `json:"backup-ts" toml:"backup-ts"`
    66  	LastBackupTS     uint64        `json:"last-backup-ts" toml:"last-backup-ts"`
    67  	GCTTL            int64         `json:"gc-ttl" toml:"gc-ttl"`
    68  	RemoveSchedulers bool          `json:"remove-schedulers" toml:"remove-schedulers"`
    69  	IgnoreStats      bool          `json:"ignore-stats" toml:"ignore-stats"`
    70  	UseBackupMetaV2  bool          `json:"use-backupmeta-v2"`
    71  	CompressionConfig
    72  }
    73  
    74  // DefineBackupFlags defines common flags for the backup command.
    75  func DefineBackupFlags(flags *pflag.FlagSet) {
    76  	flags.Duration(
    77  		flagBackupTimeago, 0,
    78  		"The history version of the backup task, e.g. 1m, 1h. Do not exceed GCSafePoint")
    79  
    80  	// TODO: remove experimental tag if it's stable
    81  	flags.Uint64(flagLastBackupTS, 0, "(experimental) the last time backup ts,"+
    82  		" use for incremental backup, support TSO only")
    83  	flags.String(flagBackupTS, "", "the backup ts support TSO or datetime,"+
    84  		" e.g. '400036290571534337', '2018-05-11 01:42:23'")
    85  	flags.Int64(flagGCTTL, utils.DefaultBRGCSafePointTTL, "the TTL (in seconds) that PD holds for BR's GC safepoint")
    86  	flags.String(flagCompressionType, "zstd",
    87  		"backup sst file compression algorithm, value can be one of 'lz4|zstd|snappy'")
    88  	flags.Int32(flagCompressionLevel, 0, "compression level used for sst file compression")
    89  
    90  	flags.Bool(flagRemoveSchedulers, false,
    91  		"disable the balance, shuffle and region-merge schedulers in PD to speed up backup")
    92  	// This flag can impact the online cluster, so hide it in case of abuse.
    93  	_ = flags.MarkHidden(flagRemoveSchedulers)
    94  
    95  	// Disable stats by default. because of
    96  	// 1. DumpStatsToJson is not stable
    97  	// 2. It increases memory usage and might cause BR OOM.
    98  	// TODO: we need a better way to backup/restore stats.
    99  	flags.Bool(flagIgnoreStats, true, "ignore backup stats, used for test")
   100  	// This flag is used for test. we should backup stats all the time.
   101  	_ = flags.MarkHidden(flagIgnoreStats)
   102  
   103  	flags.Bool(flagUseBackupMetaV2, false,
   104  		"use backup meta v2 to store meta info")
   105  	// This flag will change the structure of backupmeta.
   106  	// we must make sure the old three version of br can parse the v2 meta to keep compatibility.
   107  	// so this flag should set to false for three version by default.
   108  	// for example:
   109  	// if we put this feature in v4.0.14, then v4.0.14 br can parse v2 meta
   110  	// but will generate v1 meta due to this flag is false. the behaviour is as same as v4.0.15, v4.0.16.
   111  	// finally v4.0.17 will set this flag to true, and generate v2 meta.
   112  	_ = flags.MarkHidden(flagUseBackupMetaV2)
   113  }
   114  
   115  // ParseFromFlags parses the backup-related flags from the flag set.
   116  func (cfg *BackupConfig) ParseFromFlags(flags *pflag.FlagSet) error {
   117  	timeAgo, err := flags.GetDuration(flagBackupTimeago)
   118  	if err != nil {
   119  		return errors.Trace(err)
   120  	}
   121  	if timeAgo < 0 {
   122  		return errors.Annotate(berrors.ErrInvalidArgument, "negative timeago is not allowed")
   123  	}
   124  	cfg.TimeAgo = timeAgo
   125  	cfg.LastBackupTS, err = flags.GetUint64(flagLastBackupTS)
   126  	if err != nil {
   127  		return errors.Trace(err)
   128  	}
   129  	backupTS, err := flags.GetString(flagBackupTS)
   130  	if err != nil {
   131  		return errors.Trace(err)
   132  	}
   133  	cfg.BackupTS, err = parseTSString(backupTS)
   134  	if err != nil {
   135  		return errors.Trace(err)
   136  	}
   137  	gcTTL, err := flags.GetInt64(flagGCTTL)
   138  	if err != nil {
   139  		return errors.Trace(err)
   140  	}
   141  	cfg.GCTTL = gcTTL
   142  
   143  	compressionCfg, err := parseCompressionFlags(flags)
   144  	if err != nil {
   145  		return errors.Trace(err)
   146  	}
   147  	cfg.CompressionConfig = *compressionCfg
   148  
   149  	if err = cfg.Config.ParseFromFlags(flags); err != nil {
   150  		return errors.Trace(err)
   151  	}
   152  	cfg.RemoveSchedulers, err = flags.GetBool(flagRemoveSchedulers)
   153  	if err != nil {
   154  		return errors.Trace(err)
   155  	}
   156  	cfg.IgnoreStats, err = flags.GetBool(flagIgnoreStats)
   157  	if err != nil {
   158  		return errors.Trace(err)
   159  	}
   160  	cfg.UseBackupMetaV2, err = flags.GetBool(flagUseBackupMetaV2)
   161  	return errors.Trace(err)
   162  }
   163  
   164  // ParseFromFlags parses the backup-related flags from the flag set.
   165  func parseCompressionFlags(flags *pflag.FlagSet) (*CompressionConfig, error) {
   166  	compressionStr, err := flags.GetString(flagCompressionType)
   167  	if err != nil {
   168  		return nil, errors.Trace(err)
   169  	}
   170  	compressionType, err := parseCompressionType(compressionStr)
   171  	if err != nil {
   172  		return nil, errors.Trace(err)
   173  	}
   174  	level, err := flags.GetInt32(flagCompressionLevel)
   175  	if err != nil {
   176  		return nil, errors.Trace(err)
   177  	}
   178  	return &CompressionConfig{
   179  		CompressionLevel: level,
   180  		CompressionType:  compressionType,
   181  	}, nil
   182  }
   183  
   184  // adjustBackupConfig is use for BR(binary) and BR in TiDB.
   185  // When new config was add and not included in parser.
   186  // we should set proper value in this function.
   187  // so that both binary and TiDB will use same default value.
   188  func (cfg *BackupConfig) adjustBackupConfig() {
   189  	cfg.adjust()
   190  	usingDefaultConcurrency := false
   191  	if cfg.Config.Concurrency == 0 {
   192  		cfg.Config.Concurrency = defaultBackupConcurrency
   193  		usingDefaultConcurrency = true
   194  	}
   195  	if cfg.Config.Concurrency > maxBackupConcurrency {
   196  		cfg.Config.Concurrency = maxBackupConcurrency
   197  	}
   198  	if cfg.RateLimit != unlimited {
   199  		// TiKV limits the upload rate by each backup request.
   200  		// When the backup requests are sent concurrently,
   201  		// the ratelimit couldn't work as intended.
   202  		// Degenerating to sequentially sending backup requests to avoid this.
   203  		if !usingDefaultConcurrency {
   204  			logutil.WarnTerm("setting `--ratelimit` and `--concurrency` at the same time, "+
   205  				"ignoring `--concurrency`: `--ratelimit` forces sequential (i.e. concurrency = 1) backup",
   206  				zap.String("ratelimit", units.HumanSize(float64(cfg.RateLimit))+"/s"),
   207  				zap.Uint32("concurrency-specified", cfg.Config.Concurrency))
   208  		}
   209  		cfg.Config.Concurrency = 1
   210  	}
   211  
   212  	if cfg.GCTTL == 0 {
   213  		cfg.GCTTL = utils.DefaultBRGCSafePointTTL
   214  	}
   215  	// Use zstd as default
   216  	if cfg.CompressionType == backuppb.CompressionType_UNKNOWN {
   217  		cfg.CompressionType = backuppb.CompressionType_ZSTD
   218  	}
   219  }
   220  
   221  // RunBackup starts a backup task inside the current goroutine.
   222  func RunBackup(c context.Context, g glue.Glue, cmdName string, cfg *BackupConfig) error {
   223  	cfg.adjustBackupConfig()
   224  
   225  	defer summary.Summary(cmdName)
   226  	ctx, cancel := context.WithCancel(c)
   227  	defer cancel()
   228  
   229  	if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil {
   230  		span1 := span.Tracer().StartSpan("task.RunBackup", opentracing.ChildOf(span.Context()))
   231  		defer span1.Finish()
   232  		ctx = opentracing.ContextWithSpan(ctx, span1)
   233  	}
   234  
   235  	u, err := storage.ParseBackend(cfg.Storage, &cfg.BackendOptions)
   236  	if err != nil {
   237  		return errors.Trace(err)
   238  	}
   239  	skipStats := cfg.IgnoreStats
   240  	// For backup, Domain is not needed if user ignores stats.
   241  	// Domain loads all table info into memory. By skipping Domain, we save
   242  	// lots of memory (about 500MB for 40K 40 fields YCSB tables).
   243  	needDomain := !skipStats
   244  	mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, GetKeepalive(&cfg.Config), cfg.CheckRequirements, needDomain)
   245  	if err != nil {
   246  		return errors.Trace(err)
   247  	}
   248  	defer mgr.Close()
   249  	var statsHandle *handle.Handle
   250  	if !skipStats {
   251  		statsHandle = mgr.GetDomain().StatsHandle()
   252  	}
   253  
   254  	client, err := backup.NewBackupClient(ctx, mgr)
   255  	if err != nil {
   256  		return errors.Trace(err)
   257  	}
   258  	opts := storage.ExternalStorageOptions{
   259  		NoCredentials:   cfg.NoCreds,
   260  		SendCredentials: cfg.SendCreds,
   261  		SkipCheckPath:   cfg.SkipCheckPath,
   262  	}
   263  	if err = client.SetStorage(ctx, u, &opts); err != nil {
   264  		return errors.Trace(err)
   265  	}
   266  	err = client.SetLockFile(ctx)
   267  	if err != nil {
   268  		return errors.Trace(err)
   269  	}
   270  	client.SetGCTTL(cfg.GCTTL)
   271  
   272  	backupTS, err := client.GetTS(ctx, cfg.TimeAgo, cfg.BackupTS)
   273  	if err != nil {
   274  		return errors.Trace(err)
   275  	}
   276  	g.Record("BackupTS", backupTS)
   277  	sp := utils.BRServiceSafePoint{
   278  		BackupTS: backupTS,
   279  		TTL:      client.GetGCTTL(),
   280  		ID:       utils.MakeSafePointID(),
   281  	}
   282  	// use lastBackupTS as safePoint if exists
   283  	if cfg.LastBackupTS > 0 {
   284  		sp.BackupTS = cfg.LastBackupTS
   285  	}
   286  
   287  	log.Info("current backup safePoint job", zap.Object("safePoint", sp))
   288  	err = utils.StartServiceSafePointKeeper(ctx, mgr.GetPDClient(), sp)
   289  	if err != nil {
   290  		return errors.Trace(err)
   291  	}
   292  
   293  	isIncrementalBackup := cfg.LastBackupTS > 0
   294  
   295  	if cfg.RemoveSchedulers {
   296  		log.Debug("removing some PD schedulers")
   297  		restore, e := mgr.RemoveSchedulers(ctx)
   298  		defer func() {
   299  			if ctx.Err() != nil {
   300  				log.Warn("context canceled, doing clean work with background context")
   301  				ctx = context.Background()
   302  			}
   303  			if restoreE := restore(ctx); restoreE != nil {
   304  				log.Warn("failed to restore removed schedulers, you may need to restore them manually", zap.Error(restoreE))
   305  			}
   306  		}()
   307  		if e != nil {
   308  			return errors.Trace(err)
   309  		}
   310  	}
   311  
   312  	req := backuppb.BackupRequest{
   313  		ClusterId:        client.GetClusterID(),
   314  		StartVersion:     cfg.LastBackupTS,
   315  		EndVersion:       backupTS,
   316  		RateLimit:        cfg.RateLimit,
   317  		Concurrency:      defaultBackupConcurrency,
   318  		CompressionType:  cfg.CompressionType,
   319  		CompressionLevel: cfg.CompressionLevel,
   320  	}
   321  	brVersion := g.GetVersion()
   322  	clusterVersion, err := mgr.GetClusterVersion(ctx)
   323  	if err != nil {
   324  		return errors.Trace(err)
   325  	}
   326  
   327  	ranges, schemas, err := backup.BuildBackupRangeAndSchema(mgr.GetStorage(), cfg.TableFilter, backupTS)
   328  	if err != nil {
   329  		return errors.Trace(err)
   330  	}
   331  
   332  	// Metafile size should be less than 64MB.
   333  	metawriter := metautil.NewMetaWriter(client.GetStorage(), metautil.MetaFileSize, cfg.UseBackupMetaV2)
   334  
   335  	// nothing to backup
   336  	if ranges == nil {
   337  		// Hack way to update backupmeta.
   338  		metawriter.StartWriteMetasAsync(ctx, metautil.AppendSchema)
   339  		metawriter.Update(func(m *backuppb.BackupMeta) {
   340  			m.StartVersion = req.StartVersion
   341  			m.EndVersion = req.EndVersion
   342  			m.IsRawKv = req.IsRawKv
   343  			m.ClusterId = req.ClusterId
   344  			m.ClusterVersion = clusterVersion
   345  			m.BrVersion = brVersion
   346  		})
   347  		pdAddress := strings.Join(cfg.PD, ",")
   348  		log.Warn("Nothing to backup, maybe connected to cluster for restoring",
   349  			zap.String("PD address", pdAddress))
   350  		return metawriter.FinishWriteMetas(ctx, metautil.AppendSchema)
   351  	}
   352  
   353  	if isIncrementalBackup {
   354  		if backupTS <= cfg.LastBackupTS {
   355  			log.Error("LastBackupTS is larger or equal to current TS")
   356  			return errors.Annotate(berrors.ErrInvalidArgument, "LastBackupTS is larger or equal to current TS")
   357  		}
   358  		err = utils.CheckGCSafePoint(ctx, mgr.GetPDClient(), cfg.LastBackupTS)
   359  		if err != nil {
   360  			log.Error("Check gc safepoint for last backup ts failed", zap.Error(err))
   361  			return errors.Trace(err)
   362  		}
   363  
   364  		metawriter.StartWriteMetasAsync(ctx, metautil.AppendDDL)
   365  		err = backup.WriteBackupDDLJobs(metawriter, mgr.GetStorage(), cfg.LastBackupTS, backupTS)
   366  		if err != nil {
   367  			return errors.Trace(err)
   368  		}
   369  		if err = metawriter.FinishWriteMetas(ctx, metautil.AppendDDL); err != nil {
   370  			return errors.Trace(err)
   371  		}
   372  	}
   373  
   374  	summary.CollectInt("backup total ranges", len(ranges))
   375  
   376  	var updateCh glue.Progress
   377  	var unit backup.ProgressUnit
   378  	if len(ranges) < 100 {
   379  		unit = backup.RegionUnit
   380  		// The number of regions need to backup
   381  		approximateRegions := 0
   382  		for _, r := range ranges {
   383  			var regionCount int
   384  			regionCount, err = mgr.GetRegionCount(ctx, r.StartKey, r.EndKey)
   385  			if err != nil {
   386  				return errors.Trace(err)
   387  			}
   388  			approximateRegions += regionCount
   389  		}
   390  		// Redirect to log if there is no log file to avoid unreadable output.
   391  		updateCh = g.StartProgress(
   392  			ctx, cmdName, int64(approximateRegions), !cfg.LogProgress)
   393  		summary.CollectInt("backup total regions", approximateRegions)
   394  	} else {
   395  		unit = backup.RangeUnit
   396  		// To reduce the costs, we can use the range as unit of progress.
   397  		updateCh = g.StartProgress(
   398  			ctx, cmdName, int64(len(ranges)), !cfg.LogProgress)
   399  	}
   400  
   401  	progressCount := 0
   402  	progressCallBack := func(callBackUnit backup.ProgressUnit) {
   403  		if unit == callBackUnit {
   404  			updateCh.Inc()
   405  			progressCount++
   406  			failpoint.Inject("progress-call-back", func(v failpoint.Value) {
   407  				log.Info("failpoint progress-call-back injected")
   408  				if fileName, ok := v.(string); ok {
   409  					f, osErr := os.OpenFile(fileName, os.O_CREATE|os.O_WRONLY, os.ModePerm)
   410  					if osErr != nil {
   411  						log.Warn("failed to create file", zap.Error(osErr))
   412  					}
   413  					msg := []byte(fmt.Sprintf("%s:%d\n", unit, progressCount))
   414  					_, err = f.Write(msg)
   415  					if err != nil {
   416  						log.Warn("failed to write data to file", zap.Error(err))
   417  					}
   418  				}
   419  			})
   420  		}
   421  	}
   422  	metawriter.StartWriteMetasAsync(ctx, metautil.AppendDataFile)
   423  	err = client.BackupRanges(ctx, ranges, req, uint(cfg.Concurrency), metawriter, progressCallBack)
   424  	if err != nil {
   425  		return errors.Trace(err)
   426  	}
   427  	// Backup has finished
   428  	updateCh.Close()
   429  
   430  	err = metawriter.FinishWriteMetas(ctx, metautil.AppendDataFile)
   431  	if err != nil {
   432  		return errors.Trace(err)
   433  	}
   434  
   435  	metawriter.Update(func(m *backuppb.BackupMeta) {
   436  		m.StartVersion = req.StartVersion
   437  		m.EndVersion = req.EndVersion
   438  		m.IsRawKv = req.IsRawKv
   439  		m.ClusterId = req.ClusterId
   440  		m.ClusterVersion = clusterVersion
   441  		m.BrVersion = brVersion
   442  	})
   443  
   444  	skipChecksum := !cfg.Checksum || isIncrementalBackup
   445  	checksumProgress := int64(schemas.Len())
   446  	if skipChecksum {
   447  		checksumProgress = 1
   448  		if isIncrementalBackup {
   449  			// Since we don't support checksum for incremental data, fast checksum should be skipped.
   450  			log.Info("Skip fast checksum in incremental backup")
   451  		} else {
   452  			// When user specified not to calculate checksum, don't calculate checksum.
   453  			log.Info("Skip fast checksum")
   454  		}
   455  	}
   456  	updateCh = g.StartProgress(ctx, "Checksum", checksumProgress, !cfg.LogProgress)
   457  	schemasConcurrency := uint(utils.MinInt(backup.DefaultSchemaConcurrency, schemas.Len()))
   458  
   459  	err = schemas.BackupSchemas(
   460  		ctx, metawriter, mgr.GetStorage(), statsHandle, backupTS, schemasConcurrency, cfg.ChecksumConcurrency, skipChecksum, updateCh)
   461  	if err != nil {
   462  		return errors.Trace(err)
   463  	}
   464  	// Checksum has finished, close checksum progress.
   465  	updateCh.Close()
   466  
   467  	if !skipChecksum {
   468  		// Check if checksum from files matches checksum from coprocessor.
   469  		err = checksum.FastChecksum(ctx, metawriter.Backupmeta(), client.GetStorage())
   470  		if err != nil {
   471  			return errors.Trace(err)
   472  		}
   473  	}
   474  
   475  	g.Record(summary.BackupDataSize, metawriter.ArchiveSize())
   476  	failpoint.Inject("s3-outage-during-writing-file", func(v failpoint.Value) {
   477  		log.Info("failpoint s3-outage-during-writing-file injected, " +
   478  			"process will sleep for 3s and notify the shell to kill s3 service.")
   479  		if sigFile, ok := v.(string); ok {
   480  			file, err := os.Create(sigFile)
   481  			if err != nil {
   482  				log.Warn("failed to create file for notifying, skipping notify", zap.Error(err))
   483  			}
   484  			if file != nil {
   485  				file.Close()
   486  			}
   487  		}
   488  		time.Sleep(3 * time.Second)
   489  	})
   490  	// Set task summary to success status.
   491  	summary.SetSuccessStatus(true)
   492  	return nil
   493  }
   494  
   495  // parseTSString port from tidb setSnapshotTS.
   496  func parseTSString(ts string) (uint64, error) {
   497  	if len(ts) == 0 {
   498  		return 0, nil
   499  	}
   500  	if tso, err := strconv.ParseUint(ts, 10, 64); err == nil {
   501  		return tso, nil
   502  	}
   503  
   504  	loc := time.Local
   505  	sc := &stmtctx.StatementContext{
   506  		TimeZone: loc,
   507  	}
   508  	t, err := types.ParseTime(sc, ts, mysql.TypeTimestamp, types.MaxFsp)
   509  	if err != nil {
   510  		return 0, errors.Trace(err)
   511  	}
   512  	t1, err := t.GoTime(loc)
   513  	if err != nil {
   514  		return 0, errors.Trace(err)
   515  	}
   516  	return oracle.GoTimeToTS(t1), nil
   517  }
   518  
   519  func parseCompressionType(s string) (backuppb.CompressionType, error) {
   520  	var ct backuppb.CompressionType
   521  	switch s {
   522  	case "lz4":
   523  		ct = backuppb.CompressionType_LZ4
   524  	case "snappy":
   525  		ct = backuppb.CompressionType_SNAPPY
   526  	case "zstd":
   527  		ct = backuppb.CompressionType_ZSTD
   528  	default:
   529  		return backuppb.CompressionType_UNKNOWN, errors.Annotatef(berrors.ErrInvalidArgument, "invalid compression type '%s'", s)
   530  	}
   531  	return ct, nil
   532  }