github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/pdutil/pd.go (about)

     1  // Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.
     2  
     3  package pdutil
     4  
     5  import (
     6  	"bytes"
     7  	"context"
     8  	"crypto/tls"
     9  	"encoding/json"
    10  	"fmt"
    11  	"io"
    12  	"math"
    13  	"net/http"
    14  	"net/url"
    15  	"strings"
    16  	"time"
    17  
    18  	"github.com/coreos/go-semver/semver"
    19  	"github.com/docker/go-units"
    20  	"github.com/opentracing/opentracing-go"
    21  	"github.com/pingcap/errors"
    22  	"github.com/pingcap/failpoint"
    23  	"github.com/pingcap/log"
    24  	"github.com/pingcap/tidb/util/codec"
    25  	pd "github.com/tikv/pd/client"
    26  	pdapi "github.com/tikv/pd/server/api"
    27  	"go.uber.org/zap"
    28  	"google.golang.org/grpc"
    29  
    30  	berrors "github.com/pingcap/br/pkg/errors"
    31  	"github.com/pingcap/br/pkg/httputil"
    32  	"github.com/pingcap/br/pkg/lightning/common"
    33  )
    34  
// PD HTTP API paths (joined onto a PD endpoint address by pdRequest) and
// tuning constants used throughout this file.
const (
	clusterVersionPrefix = "pd/api/v1/config/cluster-version"
	regionCountPrefix    = "pd/api/v1/stats/region"
	storePrefix          = "pd/api/v1/store"
	schedulerPrefix      = "pd/api/v1/schedulers"
	maxMsgSize           = int(128 * units.MiB) // pd.ScanRegion may return a large response
	scheduleConfigPrefix = "pd/api/v1/config/schedule"
	// pauseTimeout is the lifetime requested for a pause; the background
	// goroutine renews pauses every pauseTimeout / 3.
	pauseTimeout = 5 * time.Minute

	// pdRequestRetryTime is the maximum number of times pdRequest re-sends a
	// request after PD answers with a 5xx status.
	pdRequestRetryTime = 10

	// set max-pending-peer-count to a large value to avoid scatter region failed.
	maxPendingPeerUnlimited uint64 = math.MaxInt32
)
    50  
// pauseConfigGenerator generates the value a PD schedule config item should
// take while scheduling is paused. The first argument is the store count and
// the second is the item's current value.
type pauseConfigGenerator func(int, interface{}) interface{}
    53  
    54  // zeroPauseConfig sets the config to 0.
    55  func zeroPauseConfig(int, interface{}) interface{} {
    56  	return 0
    57  }
    58  
    59  // pauseConfigMulStores multiplies the existing value by
    60  // number of stores. The value is limited to 40, as larger value
    61  // may make the cluster unstable.
    62  func pauseConfigMulStores(stores int, raw interface{}) interface{} {
    63  	rawCfg := raw.(float64)
    64  	return math.Min(40, rawCfg*float64(stores))
    65  }
    66  
    67  // pauseConfigFalse sets the config to "false".
    68  func pauseConfigFalse(int, interface{}) interface{} {
    69  	return "false"
    70  }
    71  
    72  // constConfigGeneratorBuilder build a pauseConfigGenerator based on a given const value.
    73  func constConfigGeneratorBuilder(val interface{}) pauseConfigGenerator {
    74  	return func(int, interface{}) interface{} {
    75  		return val
    76  	}
    77  }
    78  
// ClusterConfig represents a set of schedulers whose config has been modified
// along with their original config.
type ClusterConfig struct {
	// Enable PD schedulers before restore
	Schedulers []string `json:"schedulers"`
	// Original schedule configuration
	ScheduleCfg map[string]interface{} `json:"schedule_cfg"`
}
    87  
// pauseSchedulerBody is the JSON body posted to the PD scheduler endpoint to
// pause or resume a scheduler.
type pauseSchedulerBody struct {
	// Delay is the requested pause length; resumeSchedulerWith sends 0 to stop the pause.
	// NOTE(review): PD presumably interprets this value in seconds — confirm against the PD API.
	Delay int64 `json:"delay"`
}
    91  
var (
	// in v4.0.8 version we can use pause configs
	// see https://github.com/tikv/pd/pull/3088
	pauseConfigVersion = semver.Version{Major: 4, Minor: 0, Patch: 8}

	// Schedulers represent region/leader schedulers which can impact on performance.
	Schedulers = map[string]struct{}{
		"balance-leader-scheduler":     {},
		"balance-hot-region-scheduler": {},
		"balance-region-scheduler":     {},

		"shuffle-leader-scheduler":     {},
		"shuffle-region-scheduler":     {},
		"shuffle-hot-region-scheduler": {},
	}
	// expectPDCfg maps each PD schedule config item that is overridden while
	// schedulers are paused to the generator producing its temporary value.
	expectPDCfg = map[string]pauseConfigGenerator{
		"max-merge-region-keys": zeroPauseConfig,
		"max-merge-region-size": zeroPauseConfig,
		// TODO "leader-schedule-limit" and "region-schedule-limit" don't support ttl for now,
		// but we still need set these config for compatible with old version.
		// we need wait for https://github.com/tikv/pd/pull/3131 merged.
		// see details https://github.com/pingcap/br/pull/592#discussion_r522684325
		"leader-schedule-limit":       pauseConfigMulStores,
		"region-schedule-limit":       pauseConfigMulStores,
		"max-snapshot-count":          pauseConfigMulStores,
		"enable-location-replacement": pauseConfigFalse,
		"max-pending-peer-count":      constConfigGeneratorBuilder(maxPendingPeerUnlimited),
	}

	// defaultPDCfg find by https://github.com/tikv/pd/blob/master/conf/config.toml.
	// It is the config posted by UpdatePDScheduleConfig.
	// NOTE(review): "max-snapshot-count" and "max-pending-peer-count" are
	// overridden via expectPDCfg but have no default listed here — confirm
	// whether that is intentional.
	defaultPDCfg = map[string]interface{}{
		"max-merge-region-keys":       200000,
		"max-merge-region-size":       20,
		"leader-schedule-limit":       4,
		"region-schedule-limit":       2048,
		"enable-location-replacement": "true",
	}
)
   130  
// pdHTTPRequest defines the interface to send a request to pd and return the result in bytes.
// Its arguments are, in order: the context, the PD address, the API path
// (prefix), the HTTP client, the HTTP method and the request body.
// pdRequest is the implementation used outside of tests.
type pdHTTPRequest func(context.Context, string, string, *http.Client, string, io.Reader) ([]byte, error)
   133  
   134  // pdRequest is a func to send a HTTP to pd and return the result bytes.
   135  func pdRequest(
   136  	ctx context.Context,
   137  	addr string, prefix string,
   138  	cli *http.Client, method string, body io.Reader) ([]byte, error) {
   139  	u, err := url.Parse(addr)
   140  	if err != nil {
   141  		return nil, errors.Trace(err)
   142  	}
   143  	reqURL := fmt.Sprintf("%s/%s", u, prefix)
   144  	req, err := http.NewRequestWithContext(ctx, method, reqURL, body)
   145  	if err != nil {
   146  		return nil, errors.Trace(err)
   147  	}
   148  	resp, err := cli.Do(req)
   149  	if err != nil {
   150  		return nil, errors.Trace(err)
   151  	}
   152  	count := 0
   153  	for {
   154  		count++
   155  		if count > pdRequestRetryTime || resp.StatusCode < 500 {
   156  			break
   157  		}
   158  		resp.Body.Close()
   159  		time.Sleep(time.Second)
   160  		resp, err = cli.Do(req)
   161  		if err != nil {
   162  			return nil, errors.Trace(err)
   163  		}
   164  	}
   165  	defer resp.Body.Close()
   166  	if resp.StatusCode != http.StatusOK {
   167  		res, _ := io.ReadAll(resp.Body)
   168  		return nil, errors.Annotatef(berrors.ErrPDInvalidResponse, "[%d] %s %s", resp.StatusCode, res, reqURL)
   169  	}
   170  
   171  	r, err := io.ReadAll(resp.Body)
   172  	if err != nil {
   173  		return nil, errors.Trace(err)
   174  	}
   175  	return r, nil
   176  }
   177  
// PdController manage get/update config from pd.
type PdController struct {
	// addrs holds the PD HTTP endpoints that requests are tried against, in order.
	addrs []string
	// cli is the HTTP client used for PD HTTP API requests.
	cli *http.Client
	// pdClient is the PD client created in NewPdController.
	pdClient pd.Client
	// version is the cluster version parsed in NewPdController; it decides
	// whether pause configs (TTL configs) are available.
	version *semver.Version

	// control the pause schedulers goroutine
	schedulerPauseCh chan struct{}
}
   188  
   189  // NewPdController creates a new PdController.
   190  func NewPdController(
   191  	ctx context.Context,
   192  	pdAddrs string,
   193  	tlsConf *tls.Config,
   194  	securityOption pd.SecurityOption,
   195  ) (*PdController, error) {
   196  	cli := httputil.NewClient(tlsConf)
   197  
   198  	addrs := strings.Split(pdAddrs, ",")
   199  	processedAddrs := make([]string, 0, len(addrs))
   200  	var failure error
   201  	var versionBytes []byte
   202  	for _, addr := range addrs {
   203  		if !strings.HasPrefix(addr, "http") {
   204  			if tlsConf != nil {
   205  				addr = "https://" + addr
   206  			} else {
   207  				addr = "http://" + addr
   208  			}
   209  		}
   210  		processedAddrs = append(processedAddrs, addr)
   211  		versionBytes, failure = pdRequest(ctx, addr, clusterVersionPrefix, cli, http.MethodGet, nil)
   212  		if failure == nil {
   213  			break
   214  		}
   215  	}
   216  	if failure != nil {
   217  		return nil, errors.Annotatef(berrors.ErrPDUpdateFailed, "pd address (%s) not available, please check network", pdAddrs)
   218  	}
   219  
   220  	version := parseVersion(versionBytes)
   221  	maxCallMsgSize := []grpc.DialOption{
   222  		grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(maxMsgSize)),
   223  		grpc.WithDefaultCallOptions(grpc.MaxCallSendMsgSize(maxMsgSize)),
   224  	}
   225  	pdClient, err := pd.NewClientWithContext(
   226  		ctx, addrs, securityOption,
   227  		pd.WithGRPCDialOptions(maxCallMsgSize...),
   228  		pd.WithCustomTimeoutOption(10*time.Second),
   229  	)
   230  	if err != nil {
   231  		log.Error("fail to create pd client", zap.Error(err))
   232  		return nil, errors.Trace(err)
   233  	}
   234  
   235  	return &PdController{
   236  		addrs:    processedAddrs,
   237  		cli:      cli,
   238  		pdClient: pdClient,
   239  		version:  version,
   240  		// We should make a buffered channel here otherwise when context canceled,
   241  		// gracefully shutdown will stick at resuming schedulers.
   242  		schedulerPauseCh: make(chan struct{}, 1),
   243  	}, nil
   244  }
   245  
   246  func parseVersion(versionBytes []byte) *semver.Version {
   247  	// we need trim space or semver will parse failed
   248  	v := strings.TrimSpace(string(versionBytes))
   249  	v = strings.Trim(v, "\"")
   250  	v = strings.TrimPrefix(v, "v")
   251  	version, err := semver.NewVersion(v)
   252  	if err != nil {
   253  		log.Warn("fail back to v0.0.0 version",
   254  			zap.ByteString("version", versionBytes), zap.Error(err))
   255  		version = &semver.Version{Major: 0, Minor: 0, Patch: 0}
   256  	}
   257  	failpoint.Inject("PDEnabledPauseConfig", func(val failpoint.Value) {
   258  		if val.(bool) {
   259  			// test pause config is enable
   260  			version = &semver.Version{Major: 5, Minor: 0, Patch: 0}
   261  		}
   262  	})
   263  	return version
   264  }
   265  
   266  func (p *PdController) isPauseConfigEnabled() bool {
   267  	return p.version.Compare(pauseConfigVersion) >= 0
   268  }
   269  
   270  // SetHTTP set pd addrs and cli for test.
   271  func (p *PdController) SetHTTP(addrs []string, cli *http.Client) {
   272  	p.addrs = addrs
   273  	p.cli = cli
   274  }
   275  
// SetPDClient replaces the PD client held by the controller; it exists for tests.
func (p *PdController) SetPDClient(pdClient pd.Client) {
	p.pdClient = pdClient
}
   280  
// GetPDClient returns the PD client held by the controller.
func (p *PdController) GetPDClient() pd.Client {
	return p.pdClient
}
   285  
// GetClusterVersion returns the current cluster version.
// It queries PD over HTTP using pdRequest and returns the response body as a string.
func (p *PdController) GetClusterVersion(ctx context.Context) (string, error) {
	return p.getClusterVersionWith(ctx, pdRequest)
}
   290  
   291  func (p *PdController) getClusterVersionWith(ctx context.Context, get pdHTTPRequest) (string, error) {
   292  	var err error
   293  	for _, addr := range p.addrs {
   294  		v, e := get(ctx, addr, clusterVersionPrefix, p.cli, http.MethodGet, nil)
   295  		if e != nil {
   296  			err = e
   297  			continue
   298  		}
   299  		return string(v), nil
   300  	}
   301  
   302  	return "", errors.Trace(err)
   303  }
   304  
// GetRegionCount returns the region count in the specified range.
// An empty endKey means the range has no upper bound. The request is sent
// with pdRequest.
func (p *PdController) GetRegionCount(ctx context.Context, startKey, endKey []byte) (int, error) {
	return p.getRegionCountWith(ctx, pdRequest, startKey, endKey)
}
   309  
   310  func (p *PdController) getRegionCountWith(
   311  	ctx context.Context, get pdHTTPRequest, startKey, endKey []byte,
   312  ) (int, error) {
   313  	// TiKV reports region start/end keys to PD in memcomparable-format.
   314  	var start, end string
   315  	start = url.QueryEscape(string(codec.EncodeBytes(nil, startKey)))
   316  	if len(endKey) != 0 { // Empty end key means the max.
   317  		end = url.QueryEscape(string(codec.EncodeBytes(nil, endKey)))
   318  	}
   319  	var err error
   320  	for _, addr := range p.addrs {
   321  		query := fmt.Sprintf(
   322  			"%s?start_key=%s&end_key=%s",
   323  			regionCountPrefix, start, end)
   324  		v, e := get(ctx, addr, query, p.cli, http.MethodGet, nil)
   325  		if e != nil {
   326  			err = e
   327  			continue
   328  		}
   329  		regionsMap := make(map[string]interface{})
   330  		err = json.Unmarshal(v, &regionsMap)
   331  		if err != nil {
   332  			return 0, errors.Trace(err)
   333  		}
   334  		return int(regionsMap["count"].(float64)), nil
   335  	}
   336  	return 0, errors.Trace(err)
   337  }
   338  
// GetStoreInfo returns the info of store with the specified id.
// The request is sent with pdRequest.
func (p *PdController) GetStoreInfo(ctx context.Context, storeID uint64) (*pdapi.StoreInfo, error) {
	return p.getStoreInfoWith(ctx, pdRequest, storeID)
}
   343  
   344  func (p *PdController) getStoreInfoWith(
   345  	ctx context.Context, get pdHTTPRequest, storeID uint64) (*pdapi.StoreInfo, error) {
   346  	var err error
   347  	for _, addr := range p.addrs {
   348  		query := fmt.Sprintf(
   349  			"%s/%d",
   350  			storePrefix, storeID)
   351  		v, e := get(ctx, addr, query, p.cli, http.MethodGet, nil)
   352  		if e != nil {
   353  			err = e
   354  			continue
   355  		}
   356  		store := pdapi.StoreInfo{}
   357  		err = json.Unmarshal(v, &store)
   358  		if err != nil {
   359  			return nil, errors.Trace(err)
   360  		}
   361  		return &store, nil
   362  	}
   363  	return nil, errors.Trace(err)
   364  }
   365  
   366  func (p *PdController) doPauseSchedulers(ctx context.Context, schedulers []string, post pdHTTPRequest) ([]string, error) {
   367  	// pause this scheduler with 300 seconds
   368  	body, err := json.Marshal(pauseSchedulerBody{Delay: int64(pauseTimeout)})
   369  	if err != nil {
   370  		return nil, errors.Trace(err)
   371  	}
   372  	// PauseSchedulers remove pd scheduler temporarily.
   373  	removedSchedulers := make([]string, 0, len(schedulers))
   374  	for _, scheduler := range schedulers {
   375  		prefix := fmt.Sprintf("%s/%s", schedulerPrefix, scheduler)
   376  		for _, addr := range p.addrs {
   377  			_, err = post(ctx, addr, prefix, p.cli, http.MethodPost, bytes.NewBuffer(body))
   378  			if err == nil {
   379  				removedSchedulers = append(removedSchedulers, scheduler)
   380  				break
   381  			}
   382  		}
   383  		if err != nil {
   384  			return removedSchedulers, errors.Trace(err)
   385  		}
   386  	}
   387  	return removedSchedulers, nil
   388  }
   389  
// pauseSchedulersAndConfigWith pauses the given schedulers and, when
// schedulerCfg is non-nil, applies schedulerCfg as a pause config, then
// starts a goroutine that repeats both every pauseTimeout / 3.
//
// The initial pause (and initial config update) must succeed, otherwise the
// error is returned and no goroutine is started. Failures inside the
// goroutine are only logged. The goroutine exits when ctx is done or when a
// value is received from p.schedulerPauseCh (which also happens once the
// channel is closed). It returns the schedulers paused by the initial call.
func (p *PdController) pauseSchedulersAndConfigWith(
	ctx context.Context, schedulers []string,
	schedulerCfg map[string]interface{}, post pdHTTPRequest,
) ([]string, error) {
	// first pause this scheduler, if the first time failed. we should return the error
	// so put first time out of for loop. and in for loop we could ignore other failed pause.
	removedSchedulers, err := p.doPauseSchedulers(ctx, schedulers, post)
	if err != nil {
		log.Error("failed to pause scheduler at beginning",
			zap.Strings("name", schedulers), zap.Error(err))
		return nil, errors.Trace(err)
	}
	log.Info("pause scheduler successful at beginning", zap.Strings("name", schedulers))
	if schedulerCfg != nil {
		err = p.doPauseConfigs(ctx, schedulerCfg, post)
		if err != nil {
			log.Error("failed to pause config at beginning",
				zap.Any("cfg", schedulerCfg), zap.Error(err))
			return nil, errors.Trace(err)
		}
		log.Info("pause configs successful at beginning", zap.Any("cfg", schedulerCfg))
	}

	// Keep the pause alive: renew it well before pauseTimeout elapses.
	go func() {
		tick := time.NewTicker(pauseTimeout / 3)
		defer tick.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-tick.C:
				// Renewal is best effort; a failure is retried on the next tick.
				_, err := p.doPauseSchedulers(ctx, schedulers, post)
				if err != nil {
					log.Warn("pause scheduler failed, ignore it and wait next time pause", zap.Error(err))
				}
				if schedulerCfg != nil {
					err = p.doPauseConfigs(ctx, schedulerCfg, post)
					if err != nil {
						log.Warn("pause configs failed, ignore it and wait next time pause", zap.Error(err))
					}
				}
				log.Info("pause scheduler(configs)", zap.Strings("name", removedSchedulers),
					zap.Any("cfg", schedulerCfg))
			case <-p.schedulerPauseCh:
				// Signalled by resumeSchedulerWith (send) or Close (channel close).
				log.Info("exit pause scheduler and configs successful")
				return
			}
		}
	}()
	return removedSchedulers, nil
}
   442  
// ResumeSchedulers resume pd scheduler.
// It signals the background pause goroutine to stop and asks PD to end the
// pause of each given scheduler, sending the requests with pdRequest.
func (p *PdController) ResumeSchedulers(ctx context.Context, schedulers []string) error {
	return p.resumeSchedulerWith(ctx, schedulers, pdRequest)
}
   447  
   448  func (p *PdController) resumeSchedulerWith(ctx context.Context, schedulers []string, post pdHTTPRequest) (err error) {
   449  	log.Info("resume scheduler", zap.Strings("schedulers", schedulers))
   450  	p.schedulerPauseCh <- struct{}{}
   451  
   452  	// 0 means stop pause.
   453  	body, err := json.Marshal(pauseSchedulerBody{Delay: 0})
   454  	if err != nil {
   455  		return errors.Trace(err)
   456  	}
   457  	for _, scheduler := range schedulers {
   458  		prefix := fmt.Sprintf("%s/%s", schedulerPrefix, scheduler)
   459  		for _, addr := range p.addrs {
   460  			_, err = post(ctx, addr, prefix, p.cli, http.MethodPost, bytes.NewBuffer(body))
   461  			if err == nil {
   462  				break
   463  			}
   464  		}
   465  		if err != nil {
   466  			log.Error("failed to resume scheduler after retry, you may reset this scheduler manually"+
   467  				"or just wait this scheduler pause timeout", zap.String("scheduler", scheduler))
   468  		} else {
   469  			log.Info("resume scheduler successful", zap.String("scheduler", scheduler))
   470  		}
   471  	}
   472  	// no need to return error, because the pause will timeout.
   473  	return nil
   474  }
   475  
// ListSchedulers list all pd scheduler.
// The scheduler names are fetched from PD with pdRequest.
func (p *PdController) ListSchedulers(ctx context.Context) ([]string, error) {
	return p.listSchedulersWith(ctx, pdRequest)
}
   480  
   481  func (p *PdController) listSchedulersWith(ctx context.Context, get pdHTTPRequest) ([]string, error) {
   482  	var err error
   483  	for _, addr := range p.addrs {
   484  		v, e := get(ctx, addr, schedulerPrefix, p.cli, http.MethodGet, nil)
   485  		if e != nil {
   486  			err = e
   487  			continue
   488  		}
   489  		d := make([]string, 0)
   490  		err = json.Unmarshal(v, &d)
   491  		if err != nil {
   492  			return nil, errors.Trace(err)
   493  		}
   494  		return d, nil
   495  	}
   496  	return nil, errors.Trace(err)
   497  }
   498  
   499  // GetPDScheduleConfig returns PD schedule config value associated with the key.
   500  // It returns nil if there is no such config item.
   501  func (p *PdController) GetPDScheduleConfig(
   502  	ctx context.Context,
   503  ) (map[string]interface{}, error) {
   504  	var err error
   505  	for _, addr := range p.addrs {
   506  		v, e := pdRequest(
   507  			ctx, addr, scheduleConfigPrefix, p.cli, http.MethodGet, nil)
   508  		if e != nil {
   509  			err = e
   510  			continue
   511  		}
   512  		cfg := make(map[string]interface{})
   513  		err = json.Unmarshal(v, &cfg)
   514  		if err != nil {
   515  			return nil, errors.Trace(err)
   516  		}
   517  		return cfg, nil
   518  	}
   519  	return nil, errors.Trace(err)
   520  }
   521  
// UpdatePDScheduleConfig resets the PD schedule config items listed in
// defaultPDCfg to their default values, sending the update with pdRequest.
func (p *PdController) UpdatePDScheduleConfig(ctx context.Context) error {
	log.Info("update pd with default config", zap.Any("cfg", defaultPDCfg))
	return p.doUpdatePDScheduleConfig(ctx, defaultPDCfg, pdRequest)
}
   527  
   528  func (p *PdController) doUpdatePDScheduleConfig(
   529  	ctx context.Context, cfg map[string]interface{}, post pdHTTPRequest, prefixs ...string,
   530  ) error {
   531  	prefix := scheduleConfigPrefix
   532  	if len(prefixs) != 0 {
   533  		prefix = prefixs[0]
   534  	}
   535  	for _, addr := range p.addrs {
   536  		reqData, err := json.Marshal(cfg)
   537  		if err != nil {
   538  			return errors.Trace(err)
   539  		}
   540  		_, e := post(ctx, addr, prefix,
   541  			p.cli, http.MethodPost, bytes.NewBuffer(reqData))
   542  		if e == nil {
   543  			return nil
   544  		}
   545  		log.Warn("failed to update PD config, will try next", zap.Error(e), zap.String("pd", addr))
   546  	}
   547  	return errors.Annotate(berrors.ErrPDUpdateFailed, "failed to update PD schedule config")
   548  }
   549  
   550  func (p *PdController) doPauseConfigs(ctx context.Context, cfg map[string]interface{}, post pdHTTPRequest) error {
   551  	// pause this scheduler with 300 seconds
   552  	prefix := fmt.Sprintf("%s?ttlSecond=%.0f", scheduleConfigPrefix, pauseTimeout.Seconds())
   553  	return p.doUpdatePDScheduleConfig(ctx, cfg, post, prefix)
   554  }
   555  
   556  func restoreSchedulers(ctx context.Context, pd *PdController, clusterCfg ClusterConfig) error {
   557  	if err := pd.ResumeSchedulers(ctx, clusterCfg.Schedulers); err != nil {
   558  		return errors.Annotate(err, "fail to add PD schedulers")
   559  	}
   560  	log.Info("restoring config", zap.Any("config", clusterCfg.ScheduleCfg))
   561  	mergeCfg := make(map[string]interface{})
   562  	for cfgKey := range expectPDCfg {
   563  		value := clusterCfg.ScheduleCfg[cfgKey]
   564  		if value == nil {
   565  			// Ignore non-exist config.
   566  			continue
   567  		}
   568  		mergeCfg[cfgKey] = value
   569  	}
   570  
   571  	prefix := make([]string, 0, 1)
   572  	if pd.isPauseConfigEnabled() {
   573  		// set config's ttl to zero, make temporary config invalid immediately.
   574  		prefix = append(prefix, fmt.Sprintf("%s?ttlSecond=%d", scheduleConfigPrefix, 0))
   575  	}
   576  	// reset config with previous value.
   577  	if err := pd.doUpdatePDScheduleConfig(ctx, mergeCfg, pdRequest, prefix...); err != nil {
   578  		return errors.Annotate(err, "fail to update PD merge config")
   579  	}
   580  	return nil
   581  }
   582  
   583  // MakeUndoFunctionByConfig return an UndoFunc based on specified ClusterConfig
   584  func (p *PdController) MakeUndoFunctionByConfig(config ClusterConfig) UndoFunc {
   585  	restore := func(ctx context.Context) error {
   586  		return restoreSchedulers(ctx, p, config)
   587  	}
   588  	return restore
   589  }
   590  
   591  // RemoveSchedulers removes the schedulers that may slow down BR speed.
   592  func (p *PdController) RemoveSchedulers(ctx context.Context) (undo UndoFunc, err error) {
   593  	undo = Nop
   594  
   595  	origin, _, err1 := p.RemoveSchedulersWithOrigin(ctx)
   596  	if err1 != nil {
   597  		err = err1
   598  		return
   599  	}
   600  
   601  	undo = p.MakeUndoFunctionByConfig(ClusterConfig{Schedulers: origin.Schedulers, ScheduleCfg: origin.ScheduleCfg})
   602  	return undo, errors.Trace(err)
   603  }
   604  
// RemoveSchedulersWithOrigin pause and remove br related schedule configs and return the origin and modified configs
//
// The first returned ClusterConfig holds the original values of the config
// items named in expectPDCfg plus the schedulers that were paused; the second
// holds the temporary values that were applied plus the same schedulers.
// On error the configs returned may be only partially filled.
func (p *PdController) RemoveSchedulersWithOrigin(ctx context.Context) (ClusterConfig, ClusterConfig, error) {
	// When the context carries a tracing span, record this call as a child span.
	if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil {
		span1 := span.Tracer().StartSpan("PdController.RemoveSchedulers", opentracing.ChildOf(span.Context()))
		defer span1.Finish()
		ctx = opentracing.ContextWithSpan(ctx, span1)
	}

	originCfg := ClusterConfig{}
	removedCfg := ClusterConfig{}
	// The store count feeds the config generators (see pauseConfigMulStores).
	stores, err := p.pdClient.GetAllStores(ctx)
	if err != nil {
		return originCfg, removedCfg, err
	}
	scheduleCfg, err := p.GetPDScheduleConfig(ctx)
	if err != nil {
		return originCfg, removedCfg, err
	}
	// For every item we want to override, remember the current value and
	// compute the temporary one.
	disablePDCfg := make(map[string]interface{}, len(expectPDCfg))
	originPDCfg := make(map[string]interface{}, len(expectPDCfg))
	for cfgKey, cfgValFunc := range expectPDCfg {
		value, ok := scheduleCfg[cfgKey]
		if !ok {
			// Ignore non-exist config.
			continue
		}
		disablePDCfg[cfgKey] = cfgValFunc(len(stores), value)
		originPDCfg[cfgKey] = value
	}
	originCfg.ScheduleCfg = originPDCfg
	removedCfg.ScheduleCfg = disablePDCfg

	log.Debug("saved PD config", zap.Any("config", scheduleCfg))

	// Remove default PD scheduler that may affect restore process.
	existSchedulers, err := p.ListSchedulers(ctx)
	if err != nil {
		return originCfg, removedCfg, err
	}
	// Only schedulers listed in Schedulers are paused.
	needRemoveSchedulers := make([]string, 0, len(existSchedulers))
	for _, s := range existSchedulers {
		if _, ok := Schedulers[s]; ok {
			needRemoveSchedulers = append(needRemoveSchedulers, s)
		}
	}

	removedSchedulers, err := p.doRemoveSchedulersWith(ctx, needRemoveSchedulers, disablePDCfg)
	if err != nil {
		return originCfg, removedCfg, err
	}

	originCfg.Schedulers = removedSchedulers
	removedCfg.Schedulers = removedSchedulers

	return originCfg, removedCfg, nil
}
   661  
// RemoveSchedulersWithCfg removes pd schedulers and configs with specified ClusterConfig
// It pauses removeCfg.Schedulers and applies removeCfg.ScheduleCfg; the list
// of schedulers actually paused is discarded.
func (p *PdController) RemoveSchedulersWithCfg(ctx context.Context, removeCfg ClusterConfig) error {
	_, err := p.doRemoveSchedulersWith(ctx, removeCfg.Schedulers, removeCfg.ScheduleCfg)
	return err
}
   667  
   668  func (p *PdController) doRemoveSchedulersWith(
   669  	ctx context.Context,
   670  	needRemoveSchedulers []string,
   671  	disablePDCfg map[string]interface{},
   672  ) ([]string, error) {
   673  	var removedSchedulers []string
   674  	var err error
   675  	if p.isPauseConfigEnabled() {
   676  		// after 4.0.8 we can set these config with TTL
   677  		removedSchedulers, err = p.pauseSchedulersAndConfigWith(ctx, needRemoveSchedulers, disablePDCfg, pdRequest)
   678  	} else {
   679  		// adapt to earlier version (before 4.0.8) of pd cluster
   680  		// which doesn't have temporary config setting.
   681  		err = p.doUpdatePDScheduleConfig(ctx, disablePDCfg, pdRequest)
   682  		if err != nil {
   683  			return nil, err
   684  		}
   685  		removedSchedulers, err = p.pauseSchedulersAndConfigWith(ctx, needRemoveSchedulers, nil, pdRequest)
   686  	}
   687  	return removedSchedulers, err
   688  }
   689  
// Close closes the PD client and the schedulerPauseCh channel. Closing the
// channel lets a pause goroutine blocked on receiving from it proceed and exit.
// NOTE(review): a second call would panic on closing an already closed
// channel, as would resuming schedulers afterwards — confirm callers invoke
// Close exactly once, last.
func (p *PdController) Close() {
	p.pdClient.Close()
	close(p.schedulerPauseCh)
}
   695  
// FetchPDVersion get pd version
// It requests "/pd/api/v1/version" from pdAddr through the given TLS helper
// and parses the "version" field of the JSON response with parseVersion.
// A request or decoding error is returned (traced).
func FetchPDVersion(ctx context.Context, tls *common.TLS, pdAddr string) (*semver.Version, error) {
	// An example of PD version API.
	// curl http://pd_address/pd/api/v1/version
	// {
	//   "version": "v4.0.0-rc.2-451-g760fb650"
	// }
	var rawVersion struct {
		Version string `json:"version"`
	}
	err := tls.WithHost(pdAddr).GetJSON(ctx, "/pd/api/v1/version", &rawVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	return parseVersion([]byte(rawVersion.Version)), nil
}