github.com/pingcap/ticdc@v0.0.0-20220526033649-485a10ef2652/cdc/server.go

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package cdc
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"net/http"
    20  	"os"
    21  	"path/filepath"
    22  	"strings"
    23  	"sync"
    24  	"time"
    25  
    26  	"github.com/pingcap/errors"
    27  	"github.com/pingcap/log"
    28  	"github.com/pingcap/ticdc/cdc/capture"
    29  	"github.com/pingcap/ticdc/cdc/kv"
    30  	"github.com/pingcap/ticdc/cdc/puller/sorter"
    31  	"github.com/pingcap/ticdc/pkg/config"
    32  	cerror "github.com/pingcap/ticdc/pkg/errors"
    33  	"github.com/pingcap/ticdc/pkg/httputil"
    34  	"github.com/pingcap/ticdc/pkg/util"
    35  	"github.com/pingcap/ticdc/pkg/version"
    36  	tidbkv "github.com/pingcap/tidb/kv"
    37  	"github.com/prometheus/client_golang/prometheus"
    38  	pd "github.com/tikv/pd/client"
    39  	"go.etcd.io/etcd/clientv3"
    40  	"go.etcd.io/etcd/mvcc"
    41  	"go.etcd.io/etcd/pkg/logutil"
    42  	"go.uber.org/zap"
    43  	"go.uber.org/zap/zapcore"
    44  	"golang.org/x/sync/errgroup"
    45  	"golang.org/x/time/rate"
    46  	"google.golang.org/grpc"
    47  	"google.golang.org/grpc/backoff"
    48  )
    49  
    50  const (
    51  	ownerRunInterval = time.Millisecond * 500
    52  	defaultDataDir   = "/tmp/cdc_data"
	// dataDirThreshold is the recommended minimum free space (in GB) for the data-dir;
	// a warning is issued when the available space falls below it.
    54  	dataDirThreshold = 500
    55  )
    56  
    57  // Server is the capture server
    58  type Server struct {
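	// captureV2 is used when config.NewReplicaImpl is enabled; capture and
	// owner below belong to the legacy implementation.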
    59  	captureV2 *capture.Capture
    60  
    61  	capture      *Capture
    62  	owner        *Owner
    63  	ownerLock    sync.RWMutex
    64  	statusServer *http.Server
    65  	pdClient     pd.Client
    66  	etcdClient   *kv.CDCEtcdClient
    67  	kvStorage    tidbkv.Storage
    68  	pdEndpoints  []string
    69  }
    70  
    71  // NewServer creates a Server instance.
    72  func NewServer(pdEndpoints []string) (*Server, error) {
    73  	conf := config.GetGlobalServerConfig()
    74  	log.Info("creating CDC server",
    75  		zap.Strings("pd-addrs", pdEndpoints),
    76  		zap.Stringer("config", conf),
    77  	)
    78  
    79  	s := &Server{
    80  		pdEndpoints: pdEndpoints,
    81  	}
    82  	return s, nil
    83  }
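
// A minimal usage sketch (illustrative only; in the real binary the server
// command populates the global server config before doing this wiring, and
// the PD address below is just a placeholder):
//
//	server, err := NewServer([]string{"http://127.0.0.1:2379"})
//	if err != nil {
//		log.Fatal("create cdc server failed", zap.Error(err))
//	}
//	if err := server.Run(context.Background()); err != nil {
//		log.Error("run cdc server failed", zap.Error(err))
//	}
//	server.Close()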
    84  
    85  // Run runs the server.
    86  func (s *Server) Run(ctx context.Context) error {
    87  	conf := config.GetGlobalServerConfig()
    88  
    89  	grpcTLSOption, err := conf.Security.ToGRPCDialOption()
    90  	if err != nil {
    91  		return errors.Trace(err)
    92  	}
    93  	pdClient, err := pd.NewClientWithContext(
    94  		ctx, s.pdEndpoints, conf.Security.PDSecurityOption(),
    95  		pd.WithGRPCDialOptions(
    96  			grpcTLSOption,
    97  			grpc.WithBlock(),
    98  			grpc.WithConnectParams(grpc.ConnectParams{
    99  				Backoff: backoff.Config{
   100  					BaseDelay:  time.Second,
   101  					Multiplier: 1.1,
   102  					Jitter:     0.1,
   103  					MaxDelay:   3 * time.Second,
   104  				},
   105  				MinConnectTimeout: 3 * time.Second,
   106  			}),
   107  		))
   108  	if err != nil {
   109  		return cerror.WrapError(cerror.ErrServerNewPDClient, err)
   110  	}
   111  	s.pdClient = pdClient
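	// With the new replica implementation the server creates and owns its etcd
	// client here; in the legacy path it is taken from the capture (see s.run).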
   112  	if config.NewReplicaImpl {
   113  		tlsConfig, err := conf.Security.ToTLSConfig()
   114  		if err != nil {
   115  			return errors.Trace(err)
   116  		}
   117  		logConfig := logutil.DefaultZapLoggerConfig
   118  		logConfig.Level = zap.NewAtomicLevelAt(zapcore.ErrorLevel)
   119  		etcdCli, err := clientv3.New(clientv3.Config{
   120  			Endpoints:   s.pdEndpoints,
   121  			TLS:         tlsConfig,
   122  			Context:     ctx,
   123  			LogConfig:   &logConfig,
   124  			DialTimeout: 5 * time.Second,
   125  			DialOptions: []grpc.DialOption{
   126  				grpcTLSOption,
   127  				grpc.WithBlock(),
   128  				grpc.WithConnectParams(grpc.ConnectParams{
   129  					Backoff: backoff.Config{
   130  						BaseDelay:  time.Second,
   131  						Multiplier: 1.1,
   132  						Jitter:     0.1,
   133  						MaxDelay:   3 * time.Second,
   134  					},
   135  					MinConnectTimeout: 3 * time.Second,
   136  				}),
   137  			},
   138  		})
   139  		if err != nil {
   140  			return errors.Annotate(cerror.WrapError(cerror.ErrNewCaptureFailed, err), "new etcd client")
   141  		}
   142  		etcdClient := kv.NewCDCEtcdClient(ctx, etcdCli)
   143  		s.etcdClient = &etcdClient
   144  	}
   145  
   146  	if err := s.initDataDir(ctx); err != nil {
   147  		return errors.Trace(err)
   148  	}
	// So as not to block CDC server startup, warn instead of returning an error
	// when TiKV is incompatible.
   151  	errorTiKVIncompatible := false
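	// Try each PD endpoint in turn; the check only needs to succeed against one of them.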
   152  	for _, pdEndpoint := range s.pdEndpoints {
   153  		err = version.CheckClusterVersion(ctx, s.pdClient, pdEndpoint, conf.Security, errorTiKVIncompatible)
   154  		if err == nil {
   155  			break
   156  		}
   157  	}
   158  	if err != nil {
   159  		return err
   160  	}
   161  	err = s.startStatusHTTP()
   162  	if err != nil {
   163  		return err
   164  	}
   165  
   166  	kv.InitWorkerPool()
   167  	kvStore, err := kv.CreateTiStore(strings.Join(s.pdEndpoints, ","), conf.Security)
   168  	if err != nil {
   169  		return errors.Trace(err)
   170  	}
   171  	defer func() {
   172  		err := kvStore.Close()
   173  		if err != nil {
   174  			log.Warn("kv store close failed", zap.Error(err))
   175  		}
   176  	}()
   177  	s.kvStorage = kvStore
   178  	ctx = util.PutKVStorageInCtx(ctx, kvStore)
   179  	if config.NewReplicaImpl {
   180  		s.captureV2 = capture.NewCapture(s.pdClient, s.kvStorage, s.etcdClient)
   181  		return s.run(ctx)
   182  	}
	// When a capture suicides (exits with ErrCaptureSuicide), restart it.
   184  	for {
   185  		if err := s.run(ctx); cerror.ErrCaptureSuicide.NotEqual(err) {
   186  			return err
   187  		}
   188  		log.Info("server recovered", zap.String("capture-id", s.capture.info.ID))
   189  	}
   190  }
   191  
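// setOwner replaces the currently tracked owner under ownerLock.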
   192  func (s *Server) setOwner(owner *Owner) {
   193  	s.ownerLock.Lock()
   194  	defer s.ownerLock.Unlock()
   195  	s.owner = owner
   196  }
   197  
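// campaignOwnerLoop repeatedly campaigns for ownership in the legacy
// implementation; each time this capture becomes the owner it creates an Owner
// and runs it until the owner exits or resigns.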
   198  func (s *Server) campaignOwnerLoop(ctx context.Context) error {
	// In most failure cases we don't return the error directly; we simply start
	// another campaign round. The campaign loop is treated as a special
	// background routine.
   201  
   202  	conf := config.GetGlobalServerConfig()
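	// Allow roughly one campaign attempt every 20 seconds (rate 0.05/s, burst 2)
	// so that repeated campaign failures do not spin in a tight loop.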
   203  	rl := rate.NewLimiter(0.05, 2)
   204  	for {
   205  		err := rl.Wait(ctx)
   206  		if err != nil {
   207  			if errors.Cause(err) == context.Canceled {
   208  				return nil
   209  			}
   210  			return errors.Trace(err)
   211  		}
   212  
		// Campaign to be the owner; this call blocks until the capture becomes the owner.
   214  		if err := s.capture.Campaign(ctx); err != nil {
   215  			switch errors.Cause(err) {
   216  			case context.Canceled:
   217  				return nil
   218  			case mvcc.ErrCompacted:
   219  				continue
   220  			}
   221  			log.Warn("campaign owner failed", zap.Error(err))
   222  			continue
   223  		}
   224  		captureID := s.capture.info.ID
   225  		log.Info("campaign owner successfully", zap.String("capture-id", captureID))
   226  		owner, err := NewOwner(ctx, s.pdClient, s.capture.grpcPool, s.capture.session, conf.GcTTL, time.Duration(conf.OwnerFlushInterval))
   227  		if err != nil {
   228  			log.Warn("create new owner failed", zap.Error(err))
   229  			continue
   230  		}
   231  
   232  		s.setOwner(owner)
   233  		if err := owner.Run(ctx, ownerRunInterval); err != nil {
   234  			if errors.Cause(err) == context.Canceled {
   235  				log.Info("owner exited", zap.String("capture-id", captureID))
   236  				select {
   237  				case <-ctx.Done():
					// only exit campaignOwnerLoop if the parent context is done
   239  					return ctx.Err()
   240  				default:
   241  				}
   242  				log.Info("owner exited", zap.String("capture-id", captureID))
   243  			}
   244  			err2 := s.capture.Resign(ctx)
   245  			if err2 != nil {
				// if resigning ownership failed, return the error so that the capture exits
   247  				return errors.Annotatef(err2, "resign owner failed, capture: %s", captureID)
   248  			}
   249  			log.Warn("run owner failed", zap.Error(err))
   250  		}
		// the owner was resigned via the API; reset it and continue the campaign loop
   252  		s.setOwner(nil)
   253  	}
   254  }
   255  
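// etcdHealthChecker periodically probes the /health endpoint of every PD
// address and records the round-trip latency in the etcdHealthCheckDuration metric.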
   256  func (s *Server) etcdHealthChecker(ctx context.Context) error {
   257  	ticker := time.NewTicker(time.Second * 3)
   258  	defer ticker.Stop()
   259  	conf := config.GetGlobalServerConfig()
   260  
   261  	httpCli, err := httputil.NewClient(conf.Security)
   262  	if err != nil {
   263  		return err
   264  	}
   265  	defer httpCli.CloseIdleConnections()
   266  	metrics := make(map[string]prometheus.Observer)
   267  	for _, pdEndpoint := range s.pdEndpoints {
   268  		metrics[pdEndpoint] = etcdHealthCheckDuration.WithLabelValues(conf.AdvertiseAddr, pdEndpoint)
   269  	}
   270  
   271  	for {
   272  		select {
   273  		case <-ctx.Done():
   274  			return ctx.Err()
   275  		case <-ticker.C:
   276  			for _, pdEndpoint := range s.pdEndpoints {
   277  				start := time.Now()
   278  				ctx, cancel := context.WithTimeout(ctx, time.Second*10)
   279  				req, err := http.NewRequestWithContext(
   280  					ctx, http.MethodGet, fmt.Sprintf("%s/health", pdEndpoint), nil)
   281  				if err != nil {
   282  					log.Warn("etcd health check failed", zap.Error(err))
   283  					cancel()
   284  					continue
   285  				}
				resp, err := httpCli.Do(req)
				if err != nil {
					log.Warn("etcd health check error", zap.Error(err))
				} else {
					// close the response body to avoid leaking connections
					_ = resp.Body.Close()
					metrics[pdEndpoint].Observe(float64(time.Since(start)) / float64(time.Second))
				}
   292  				cancel()
   293  			}
   294  		}
   295  	}
   296  }
   297  
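// run starts the server's background routines: the capture (plus the owner
// campaign loop in the legacy implementation), the etcd health checker, and
// the sorter and kv worker pools. It returns when any of them fails or the
// context is canceled.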
   298  func (s *Server) run(ctx context.Context) (err error) {
   299  	if !config.NewReplicaImpl {
   300  		kvStorage, err := util.KVStorageFromCtx(ctx)
   301  		if err != nil {
   302  			return errors.Trace(err)
   303  		}
   304  		if s.capture != nil && s.capture.session != nil {
   305  			if err := s.capture.session.Close(); err != nil {
   306  				log.Warn("close old capture session failed", zap.Error(err))
   307  			}
   308  		}
   309  		capture, err := NewCapture(ctx, s.pdEndpoints, s.pdClient, kvStorage)
   310  		if err != nil {
   311  			return err
   312  		}
   313  		s.capture = capture
   314  		s.etcdClient = &capture.etcdClient
   315  		conf := config.GetGlobalServerConfig()
   316  		defer func() {
   317  			timeoutCtx, cancel := context.WithTimeout(context.Background(), time.Duration(conf.CaptureSessionTTL)*time.Second)
   318  			if err := s.etcdClient.DeleteCaptureInfo(timeoutCtx, s.capture.info.ID); err != nil {
   319  				log.Warn("failed to delete capture info when capture exited", zap.Error(err))
   320  			}
   321  			cancel()
   322  		}()
   323  	}
   324  	ctx, cancel := context.WithCancel(ctx)
   325  	defer cancel()
   326  
   327  	wg, cctx := errgroup.WithContext(ctx)
   328  	if config.NewReplicaImpl {
   329  		wg.Go(func() error {
   330  			return s.captureV2.Run(cctx)
   331  		})
   332  	} else {
   333  		wg.Go(func() error {
   334  			return s.campaignOwnerLoop(cctx)
   335  		})
   336  
   337  		wg.Go(func() error {
   338  			return s.capture.Run(cctx)
   339  		})
   340  	}
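	// The routines below run regardless of which capture implementation is active.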
   341  	wg.Go(func() error {
   342  		return s.etcdHealthChecker(cctx)
   343  	})
   344  
   345  	wg.Go(func() error {
   346  		return sorter.RunWorkerPool(cctx)
   347  	})
   348  
   349  	wg.Go(func() error {
   350  		return kv.RunWorkerPool(cctx)
   351  	})
   352  
   353  	return wg.Wait()
   354  }
   355  
   356  // Close closes the server.
   357  func (s *Server) Close() {
   358  	if s.capture != nil {
   359  		if !config.NewReplicaImpl {
   360  			s.capture.Cleanup()
   361  		}
   362  		closeCtx, closeCancel := context.WithTimeout(context.Background(), time.Second*2)
   363  		err := s.capture.Close(closeCtx)
   364  		if err != nil {
   365  			log.Error("close capture", zap.Error(err))
   366  		}
   367  		closeCancel()
   368  	}
   369  	if s.captureV2 != nil {
   370  		s.captureV2.AsyncClose()
   371  	}
   372  	if s.statusServer != nil {
   373  		err := s.statusServer.Close()
   374  		if err != nil {
   375  			log.Error("close status server", zap.Error(err))
   376  		}
   377  		s.statusServer = nil
   378  	}
   379  }
   380  
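// initDataDir resolves the data directory, creates it if necessary, and logs
// the available disk space together with the recommended minimum.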
   381  func (s *Server) initDataDir(ctx context.Context) error {
   382  	if err := s.setUpDataDir(ctx); err != nil {
   383  		return errors.Trace(err)
   384  	}
   385  	conf := config.GetGlobalServerConfig()
   386  	err := os.MkdirAll(conf.DataDir, 0o755)
   387  	if err != nil {
   388  		return errors.Trace(err)
   389  	}
   390  	diskInfo, err := util.GetDiskInfo(conf.DataDir)
   391  	if err != nil {
   392  		return errors.Trace(err)
   393  	}
   394  
	log.Info(fmt.Sprintf("%s is set as data-dir (%dGB available), sort-dir=%s. "+
		"It is recommended that the disk for data-dir have at least %dGB of available space",
		conf.DataDir, diskInfo.Avail, conf.Sorter.SortDir, dataDirThreshold))
   397  
   398  	return nil
   399  }
   400  
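// setUpDataDir fills in conf.DataDir (and the derived sort-dir) when data-dir
// is not configured explicitly, preferring the sort-dir of existing changefeeds
// and falling back to defaultDataDir.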
   401  func (s *Server) setUpDataDir(ctx context.Context) error {
   402  	conf := config.GetGlobalServerConfig()
   403  	if conf.DataDir != "" {
   404  		conf.Sorter.SortDir = filepath.Join(conf.DataDir, config.DefaultSortDir)
   405  		config.StoreGlobalServerConfig(conf)
   406  
   407  		return nil
   408  	}
   409  
	// s.etcdClient may be nil if NewReplicaImpl is not set to true.
	// TODO: remove this after NewReplicaImpl is set to true in a specific branch, and use server.etcdClient instead.
   412  	cli := s.etcdClient
   413  	if cli == nil {
   414  		client, err := clientv3.New(clientv3.Config{
   415  			Endpoints:   s.pdEndpoints,
   416  			Context:     ctx,
   417  			DialTimeout: 5 * time.Second,
   418  		})
   419  		if err != nil {
   420  			return err
   421  		}
   422  		etcdClient := kv.NewCDCEtcdClient(ctx, client)
   423  		cli = &etcdClient
   424  		defer cli.Close()
   425  	}
   426  
	// for backward compatibility, data-dir is decided by the sort-dir of existing changefeeds
   428  	allStatus, err := cli.GetAllChangeFeedStatus(ctx)
   429  	if err != nil {
   430  		return errors.Trace(err)
   431  	}
   432  
   433  	candidates := make([]string, 0, len(allStatus))
   434  	for id := range allStatus {
   435  		info, err := cli.GetChangeFeedInfo(ctx, id)
   436  		if err != nil {
   437  			return errors.Trace(err)
   438  		}
   439  		if info.SortDir != "" {
   440  			candidates = append(candidates, info.SortDir)
   441  		}
   442  	}
   443  
   444  	conf.DataDir = defaultDataDir
   445  	best, ok := findBestDataDir(candidates)
   446  	if ok {
   447  		conf.DataDir = best
   448  	}
   449  
   450  	conf.Sorter.SortDir = filepath.Join(conf.DataDir, config.DefaultSortDir)
   451  	config.StoreGlobalServerConfig(conf)
   452  	return nil
   453  }
   454  
// findBestDataDir tries to find the best data directory among the candidates;
// at the moment it only considers available disk space.
   457  func findBestDataDir(candidates []string) (result string, ok bool) {
	var maxAvail uint64
   459  
   460  	checker := func(dir string) (*util.DiskInfo, error) {
   461  		if err := os.MkdirAll(dir, 0o755); err != nil {
   462  			return nil, err
   463  		}
   464  		if err := util.IsDirReadWritable(dir); err != nil {
   465  			return nil, err
   466  		}
   467  		info, err := util.GetDiskInfo(dir)
   468  		if err != nil {
   469  			return nil, err
   470  		}
   471  		return info, err
   472  	}
   473  
   474  	for _, dir := range candidates {
   475  		info, err := checker(dir)
   476  		if err != nil {
   477  			log.Warn("check the availability of dir", zap.String("dir", dir), zap.Error(err))
   478  			continue
   479  		}
		if info.Avail > maxAvail {
			result = dir
			maxAvail = info.Avail
			ok = true
		}
   485  	}
   486  
   487  	if !ok && len(candidates) != 0 {
		log.Warn("failed to find a usable directory for data-dir among the candidates, use `/tmp/cdc_data` instead", zap.Strings("candidates", candidates))
   489  	}
   490  
   491  	return result, ok
   492  }