github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/upstream/upstream.go

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package upstream
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"strings"
    20  	"sync"
    21  	"sync/atomic"
    22  	"time"
    23  
    24  	"github.com/benbjohnson/clock"
    25  	dmysql "github.com/go-sql-driver/mysql"
    26  	"github.com/pingcap/log"
    27  	tidbkv "github.com/pingcap/tidb/pkg/kv"
    28  	"github.com/pingcap/tiflow/cdc/kv"
    29  	"github.com/pingcap/tiflow/pkg/config"
    30  	"github.com/pingcap/tiflow/pkg/errors"
    31  	"github.com/pingcap/tiflow/pkg/errorutil"
    32  	"github.com/pingcap/tiflow/pkg/etcd"
    33  	"github.com/pingcap/tiflow/pkg/pdutil"
    34  	"github.com/pingcap/tiflow/pkg/security"
    35  	pmysql "github.com/pingcap/tiflow/pkg/sink/mysql"
    36  	"github.com/pingcap/tiflow/pkg/txnutil/gc"
    37  	"github.com/pingcap/tiflow/pkg/version"
    38  	"github.com/prometheus/client_golang/prometheus"
    39  	tikvconfig "github.com/tikv/client-go/v2/config"
    40  	"github.com/tikv/client-go/v2/tikv"
    41  	pd "github.com/tikv/pd/client"
    42  	uatomic "github.com/uber-go/atomic"
    43  	clientv3 "go.etcd.io/etcd/client/v3"
    44  	"go.etcd.io/etcd/client/v3/concurrency"
    45  	"go.uber.org/zap"
    46  	"google.golang.org/grpc"
    47  	"google.golang.org/grpc/backoff"
    48  )
    49  
    50  const (
    51  	// uninit indicates an upstream is created but not yet initialized.
    52  	uninit int32 = iota
    53  	// normal indicates an upstream is initialized and can work normally.
    54  	normal
    55  	// closing indicates an upstream is closing.
    56  	closing
    57  	// closed indicates an upstream is closed.
    58  	closed
    59  
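        	// maxIdleDuration is how long an upstream may stay idle before
        	// shouldClose reports that it should be closed.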
    60  	maxIdleDuration = time.Minute * 30
    61  )
    62  
    63  // Upstream holds resources of a TiDB cluster. It can be shared by many changefeeds
    64  // and processors. All public fields and methods of an Upstream should be thread-safe.
    65  // Be careful never to modify any exported field of an Upstream.
    66  type Upstream struct {
    67  	ID uint64
    68  
    69  	PdEndpoints    []string
    70  	SecurityConfig *security.Credential
    71  	PDClient       pd.Client
    72  	etcdCli        *etcd.Client
    73  	session        *concurrency.Session
    74  
    75  	KVStorage   tidbkv.Storage
    76  	GrpcPool    kv.GrpcPool
    77  	RegionCache *tikv.RegionCache
    78  	PDClock     pdutil.Clock
    79  	GCManager   gc.Manager
    80  	// cancel is only used in Close().
    81  	cancel func()
    82  	mu     sync.Mutex
    83  	// idleTime records when this upstream became idle, i.e. stopped being used by any changefeed.
    84  	idleTime time.Time
    85  	// clock facilitates unit testing.
    86  	clock  clock.Clock
    87  	wg     *sync.WaitGroup
    88  	status int32
    89  
    90  	err               uatomic.Error
    91  	isDefaultUpstream bool
    92  }
    93  
    94  func newUpstream(pdEndpoints []string,
    95  	securityConfig *security.Credential,
    96  ) *Upstream {
    97  	return &Upstream{
    98  		PdEndpoints:    pdEndpoints,
    99  		SecurityConfig: securityConfig,
   100  		status:         uninit,
   101  		wg:             new(sync.WaitGroup),
   102  		clock:          clock.New(),
   103  	}
   104  }
   105  
   106  // NewUpstream4Test creates an upstream for unit tests.
   107  func NewUpstream4Test(pdClient pd.Client) *Upstream {
   108  	pdClock := pdutil.NewClock4Test()
   109  	gcManager := gc.NewManager(
   110  		etcd.GcServiceIDForTest(),
   111  		pdClient, pdClock)
   112  	res := &Upstream{
   113  		ID:             testUpstreamID,
   114  		PDClient:       pdClient,
   115  		PDClock:        pdClock,
   116  		GCManager:      gcManager,
   117  		status:         normal,
   118  		wg:             new(sync.WaitGroup),
   119  		clock:          clock.New(),
   120  		SecurityConfig: &security.Credential{},
   121  		cancel:         func() {},
   122  	}
   123  
   124  	return res
   125  }
   126  
   127  // initUpstream initializes the upstream.
   128  func initUpstream(ctx context.Context, up *Upstream, cfg CaptureTopologyCfg) error {
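        	// Derive a cancelable context; up.cancel is invoked in Close() to stop
        	// the background goroutines started at the end of this function.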
   129  	ctx, up.cancel = context.WithCancel(ctx)
   130  	grpcTLSOption, err := up.SecurityConfig.ToGRPCDialOption()
   131  	if err != nil {
   132  		up.err.Store(err)
   133  		return errors.Trace(err)
   134  	}
   135  	// init the tikv client tls global config
   136  	initGlobalConfig(up.SecurityConfig)
   137  	// the default upstream always uses the pdClient passed in from the cdc server
   138  	if !up.isDefaultUpstream {
   139  		up.PDClient, err = pd.NewClientWithContext(
   140  			ctx, up.PdEndpoints, up.SecurityConfig.PDSecurityOption(),
   141  			// the default `timeout` is 3s, which may be too small when PD is busy;
   142  			// set it to 10s to avoid frequent timeouts.
   143  			pd.WithCustomTimeoutOption(10*time.Second),
   144  			pd.WithGRPCDialOptions(
   145  				grpcTLSOption,
   146  				grpc.WithBlock(),
   147  				grpc.WithConnectParams(grpc.ConnectParams{
   148  					Backoff: backoff.Config{
   149  						BaseDelay:  time.Second,
   150  						Multiplier: 1.1,
   151  						Jitter:     0.1,
   152  						MaxDelay:   3 * time.Second,
   153  					},
   154  					MinConnectTimeout: 3 * time.Second,
   155  				}),
   156  			),
   157  			pd.WithForwardingOption(config.EnablePDForwarding))
   158  		if err != nil {
   159  			up.err.Store(err)
   160  			return errors.Trace(err)
   161  		}
   162  
   163  		etcdCli, err := etcd.CreateRawEtcdClient(up.SecurityConfig, grpcTLSOption, up.PdEndpoints...)
   164  		if err != nil {
   165  			return errors.Trace(err)
   166  		}
   167  		up.etcdCli = etcd.Wrap(etcdCli, make(map[string]prometheus.Counter))
   168  	}
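        	// Fetch the upstream cluster ID from PD and make sure it matches the ID
        	// this Upstream was created with, if one was specified.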
   169  	clusterID := up.PDClient.GetClusterID(ctx)
   170  	if up.ID != 0 && up.ID != clusterID {
   171  		err := fmt.Errorf("upstream id mismatch, expected: %d, actual: %d",
   172  			up.ID, clusterID)
   173  		up.err.Store(err)
   174  		return errors.Trace(err)
   175  	}
   176  	up.ID = clusterID
   177  
   178  	// To avoid blocking CDC server startup, we only warn instead of returning an error
   179  	// when TiKV is incompatible.
   180  	errorTiKVIncompatible := false
   181  	err = version.CheckClusterVersion(ctx, up.PDClient,
   182  		up.PdEndpoints, up.SecurityConfig, errorTiKVIncompatible)
   183  	if err != nil {
   184  		up.err.Store(err)
   185  		log.Error("init upstream error", zap.Error(err))
   186  		return errors.Trace(err)
   187  	}
   188  
   189  	up.KVStorage, err = kv.CreateTiStore(strings.Join(up.PdEndpoints, ","), up.SecurityConfig)
   190  	if err != nil {
   191  		up.err.Store(err)
   192  		return errors.Trace(err)
   193  	}
   194  
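        	// Create the shared gRPC connection pool used by the CDC kv client to talk to TiKV.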
   195  	up.GrpcPool = kv.NewGrpcPoolImpl(ctx, up.SecurityConfig)
   196  
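        	// The region cache keeps region routing metadata fetched from PD for the kv client.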
   197  	up.RegionCache = tikv.NewRegionCache(up.PDClient)
   198  
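        	// PDClock periodically fetches the current timestamp from PD so that other
        	// components can read an approximation of PD time without extra RPCs.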
   199  	up.PDClock, err = pdutil.NewClock(ctx, up.PDClient)
   200  	if err != nil {
   201  		up.err.Store(err)
   202  		return errors.Trace(err)
   203  	}
   204  
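        	// The GC manager maintains the CDC service GC safepoint on PD so that upstream
        	// GC does not remove data that changefeeds still need.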
   205  	up.GCManager = gc.NewManager(cfg.GCServiceID, up.PDClient, up.PDClock)
   206  
   207  	// Update the meta-region label to ensure that the meta region is isolated from data regions.
   208  	pc, err := pdutil.NewPDAPIClient(up.PDClient, up.SecurityConfig)
   209  	if err != nil {
   210  		log.Error("create pd api client failed", zap.Error(err))
   211  		return errors.Trace(err)
   212  	}
   213  	defer pc.Close()
   214  
   215  	err = pc.UpdateMetaLabel(ctx)
   216  	if err != nil {
   217  		log.Warn("failed to verify region label rule",
   218  			zap.Error(err),
   219  			zap.Uint64("upstreamID", up.ID),
   220  			zap.Strings("upstreamEndpoints", up.PdEndpoints))
   221  	}
   222  	err = up.registerTopologyInfo(ctx, cfg)
   223  	if err != nil {
   224  		return errors.Trace(err)
   225  	}
   226  
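        	// Run the PD clock and gRPC connection recycling in background goroutines;
        	// both exit when ctx is canceled in Close().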
   227  	up.wg.Add(1)
   228  	go func() {
   229  		defer up.wg.Done()
   230  		up.PDClock.Run(ctx)
   231  	}()
   232  	up.wg.Add(1)
   233  	go func() {
   234  		defer up.wg.Done()
   235  		up.GrpcPool.RecycleConn(ctx)
   236  	}()
   237  
   238  	log.Info("upstream initialized successfully", zap.Uint64("upstreamID", up.ID))
   239  	atomic.StoreInt32(&up.status, normal)
   240  	return nil
   241  }
   242  
   243  // initGlobalConfig initializes the global TLS config for the tikv client.
   244  // The region cache health check uses this global config.
   245  // TODO: remove this function after the tikv client TLS handling is refactored.
   246  func initGlobalConfig(secCfg *security.Credential) {
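        	// Only mutate the tikv client's global config when TLS is actually configured.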
   247  	if secCfg.CAPath != "" || secCfg.CertPath != "" || secCfg.KeyPath != "" {
   248  		conf := tikvconfig.GetGlobalConfig()
   249  		conf.Security.ClusterSSLCA = secCfg.CAPath
   250  		conf.Security.ClusterSSLCert = secCfg.CertPath
   251  		conf.Security.ClusterSSLKey = secCfg.KeyPath
   252  		conf.Security.ClusterVerifyCN = secCfg.CertAllowedCN
   253  		tikvconfig.StoreGlobalConfig(conf)
   254  	}
   255  }
   256  
   257  // Close closes all resources held by this upstream.
   258  func (up *Upstream) Close() {
   259  	up.mu.Lock()
   260  	defer up.mu.Unlock()
   261  	up.cancel()
   262  	if atomic.LoadInt32(&up.status) == closed ||
   263  		atomic.LoadInt32(&up.status) == closing {
   264  		return
   265  	}
   266  	atomic.StoreInt32(&up.status, closing)
   267  
   268  	// should never close default upstream's pdClient and etcdClient here
   269  	// because it's shared in the cdc server
   270  	if !up.isDefaultUpstream {
   271  		if up.PDClient != nil {
   272  			up.PDClient.Close()
   273  		}
   274  		if up.etcdCli != nil {
   275  			err := up.etcdCli.Unwrap().Close()
   276  			if err != nil {
   277  				log.Warn("etcd client close failed", zap.Error(err))
   278  			}
   279  		}
   280  	}
   281  
   282  	if up.KVStorage != nil {
   283  		err := up.KVStorage.Close()
   284  		if err != nil {
   285  			log.Warn("kv store close failed", zap.Error(err))
   286  		}
   287  	}
   288  
   289  	if up.GrpcPool != nil {
   290  		up.GrpcPool.Close()
   291  	}
   292  	if up.RegionCache != nil {
   293  		up.RegionCache.Close()
   294  	}
   295  	if up.PDClock != nil {
   296  		up.PDClock.Stop()
   297  	}
   298  	if up.session != nil {
   299  		err := up.session.Close()
   300  		if err != nil {
   301  			log.Warn("etcd session close failed", zap.Error(err))
   302  		}
   303  	}
   304  
   305  	up.wg.Wait()
   306  	atomic.StoreInt32(&up.status, closed)
   307  	log.Info("upstream closed", zap.Uint64("upstreamID", up.ID))
   308  }
   309  
   310  // Error returns the error that occurred while initializing this upstream.
   311  func (up *Upstream) Error() error {
   312  	return up.err.Load()
   313  }
   314  
   315  // IsNormal returns true if the upstream is normal.
   316  func (up *Upstream) IsNormal() bool {
   317  	return atomic.LoadInt32(&up.status) == normal && up.err.Load() == nil
   318  }
   319  
   320  // IsClosed returns true if the upstream is closed.
   321  func (up *Upstream) IsClosed() bool {
   322  	return atomic.LoadInt32(&up.status) == closed
   323  }
   324  
   325  // resetIdleTime resets the upstream idle time to zero.
   326  func (up *Upstream) resetIdleTime() {
   327  	up.mu.Lock()
   328  	defer up.mu.Unlock()
   329  
   330  	if !up.idleTime.IsZero() {
   331  		log.Info("upstream idle time is set to 0",
   332  			zap.Uint64("id", up.ID))
   333  		up.idleTime = time.Time{}
   334  	}
   335  }
   336  
   337  // trySetIdleTime sets the upstream idle time to the current time if it has not been set yet.
   338  func (up *Upstream) trySetIdleTime() {
   339  	up.mu.Lock()
   340  	defer up.mu.Unlock()
   341  	// only set idleTime if it has not been set yet
   342  	if up.idleTime.IsZero() {
   343  		log.Info("upstream idle time is set to current time",
   344  			zap.Uint64("id", up.ID))
   345  		up.idleTime = up.clock.Now()
   346  	}
   347  }
   348  
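        // registerTopologyInfo stores this capture's info in the upstream etcd under a
        // lease-bound key, so the entry is removed automatically when the capture goes offline.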
   349  func (up *Upstream) registerTopologyInfo(ctx context.Context, cfg CaptureTopologyCfg) error {
   350  	lease, err := up.etcdCli.Grant(ctx, cfg.SessionTTL)
   351  	if err != nil {
   352  		return errors.Trace(err)
   353  	}
   354  	up.session, err = concurrency.NewSession(up.etcdCli.Unwrap(), concurrency.WithLease(lease.ID))
   355  	if err != nil {
   356  		return errors.Trace(err)
   357  	}
   358  	// register capture info to upstream pd
   359  	key := fmt.Sprintf(topologyTiCDC, cfg.GCServiceID, cfg.AdvertiseAddr)
   360  	value, err := cfg.CaptureInfo.Marshal()
   361  	if err != nil {
   362  		return errors.Trace(err)
   363  	}
   364  	_, err = up.etcdCli.Put(ctx, key, string(value), clientv3.WithLease(up.session.Lease()))
   365  	return errors.WrapError(errors.ErrPDEtcdAPIError, err)
   366  }
   367  
   368  // shouldClose returns true if this upstream's
   369  // idleTime has reached maxIdleDuration.
   370  func (up *Upstream) shouldClose() bool {
   371  	// default upstream should never be closed.
   372  	if up.isDefaultUpstream {
   373  		return false
   374  	}
   375  
   376  	if !up.idleTime.IsZero() &&
   377  		up.clock.Since(up.idleTime) >= maxIdleDuration {
   378  		return true
   379  	}
   380  
   381  	return false
   382  }
   383  
   384  // VerifyTiDBUser verifies whether the username and password are valid in TiDB. It does the validation by
   385  // successfully building a connection to the upstream TiDB with the given username and password.
   386  func (up *Upstream) VerifyTiDBUser(ctx context.Context, username, password string) error {
   387  	tidbs, err := fetchTiDBTopology(ctx, up.etcdCli.Unwrap())
   388  	if err != nil {
   389  		return errors.Trace(err)
   390  	}
   391  	if len(tidbs) == 0 {
   392  		return errors.New("tidb instance not found in topology, please check if the tidb is running")
   393  	}
   394  
   395  	for _, tidb := range tidbs {
   396  		// connect tidb
   397  		host := fmt.Sprintf("%s:%d", tidb.IP, tidb.Port)
   398  		dsnStr := fmt.Sprintf("%s:%s@tcp(%s)/", username, password, host)
   399  		err = up.doVerify(ctx, dsnStr)
   400  		if err == nil {
   401  			return nil
   402  		}
   403  		if errorutil.IsAccessDeniedError(err) {
   404  			// For access denied error, we can return immediately.
   405  			// For other errors, we need to continue to verify the next tidb instance.
   406  			return errors.Trace(err)
   407  		}
   408  	}
   409  	return errors.Trace(err)
   410  }
   411  
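        // doVerify connects to a single TiDB instance with the given DSN and queries the
        // Ssl_cipher status to confirm the credentials work and to log whether the connection is encrypted.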
   412  func (up *Upstream) doVerify(ctx context.Context, dsnStr string) error {
   413  	ctx, cancel := context.WithTimeout(ctx, defaultTimeout)
   414  	defer cancel()
   415  
   416  	dsn, err := dmysql.ParseDSN(dsnStr)
   417  	if err != nil {
   418  		return errors.Trace(err)
   419  	}
   420  	// Note: we use "preferred" here to make sure the connection is encrypted if possible. It is the same as the default
   421  	// behavior of mysql client, refer to: https://dev.mysql.com/doc/refman/8.0/en/using-encrypted-connections.html.
   422  	dsn.TLSConfig = "preferred"
   423  
   424  	db, err := pmysql.GetTestDB(ctx, dsn, pmysql.CreateMySQLDBConn)
   425  	if err != nil {
   426  		return errors.Trace(err)
   427  	}
   428  	defer db.Close()
   429  
   430  	rows, err := db.Query("SHOW STATUS LIKE '%Ssl_cipher'")
   431  	if err != nil {
   432  		return errors.Trace(err)
   433  	}
   434  	defer func() {
   435  		if err := rows.Close(); err != nil {
   436  			log.Warn("query Ssl_cipher close rows failed", zap.Error(err))
   437  		}
   438  		if rows.Err() != nil {
   439  			log.Warn("query Ssl_cipher rows has error", zap.Error(rows.Err()))
   440  		}
   441  	}()
   442  
   443  	var name, value string
        	// rows.Next must be called before rows.Scan; otherwise Scan returns an error.
   444  	if rows.Next() {
   445  		if err = rows.Scan(&name, &value); err != nil {
   446  			log.Warn("failed to get ssl cipher", zap.Error(err),
   447  				zap.String("username", dsn.User), zap.Uint64("upstreamID", up.ID))
   448  		}
        	}
   449  	log.Info("verify tidb user successfully", zap.String("username", dsn.User),
   450  		zap.String("sslCipherName", name), zap.String("sslCipherValue", value),
   451  		zap.Uint64("upstreamID", up.ID))
   452  	return nil
   453  }