github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/upstream/upstream.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package upstream

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/benbjohnson/clock"
	dmysql "github.com/go-sql-driver/mysql"
	"github.com/pingcap/log"
	tidbkv "github.com/pingcap/tidb/pkg/kv"
	"github.com/pingcap/tiflow/cdc/kv"
	"github.com/pingcap/tiflow/pkg/config"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/errorutil"
	"github.com/pingcap/tiflow/pkg/etcd"
	"github.com/pingcap/tiflow/pkg/pdutil"
	"github.com/pingcap/tiflow/pkg/security"
	pmysql "github.com/pingcap/tiflow/pkg/sink/mysql"
	"github.com/pingcap/tiflow/pkg/txnutil/gc"
	"github.com/pingcap/tiflow/pkg/version"
	"github.com/prometheus/client_golang/prometheus"
	tikvconfig "github.com/tikv/client-go/v2/config"
	"github.com/tikv/client-go/v2/tikv"
	pd "github.com/tikv/pd/client"
	uatomic "github.com/uber-go/atomic"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.etcd.io/etcd/client/v3/concurrency"
	"go.uber.org/zap"
	"google.golang.org/grpc"
	"google.golang.org/grpc/backoff"
)

const (
	// uninit indicates an upstream is created but not initialized.
	uninit int32 = iota
	// normal indicates an upstream is initialized and can work normally.
	normal
	// closing indicates an upstream is closing.
	closing
	// closed indicates an upstream is closed.
	closed

	maxIdleDuration = time.Minute * 30
)

// Upstream holds the resources of a TiDB cluster and can be shared by many
// changefeeds and processors. All public fields and methods of an Upstream
// should be thread-safe. Be careful never to change any exported field of an
// Upstream.
type Upstream struct {
	ID uint64

	PdEndpoints    []string
	SecurityConfig *security.Credential
	PDClient       pd.Client
	etcdCli        *etcd.Client
	session        *concurrency.Session

	KVStorage   tidbkv.Storage
	GrpcPool    kv.GrpcPool
	RegionCache *tikv.RegionCache
	PDClock     pdutil.Clock
	GCManager   gc.Manager
	// cancel is only used in Close().
	cancel func()
	mu     sync.Mutex
	// idleTime records the time when Upstream.hc becomes zero.
	idleTime time.Time
	// clock is used to facilitate unit tests.
	clock  clock.Clock
	wg     *sync.WaitGroup
	status int32

	err               uatomic.Error
	isDefaultUpstream bool
}

func newUpstream(pdEndpoints []string,
	securityConfig *security.Credential,
) *Upstream {
	return &Upstream{
		PdEndpoints:    pdEndpoints,
		SecurityConfig: securityConfig,
		status:         uninit,
		wg:             new(sync.WaitGroup),
		clock:          clock.New(),
	}
}
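
// For orientation, a minimal sketch of the intended lifecycle (the PD
// endpoint is hypothetical, and cfg stands for a CaptureTopologyCfg built by
// the caller):
//
//	up := newUpstream([]string{"http://pd0:2379"}, &security.Credential{})
//	if err := initUpstream(ctx, up, cfg); err != nil {
//		log.Error("init upstream failed", zap.Error(err))
//	}
//	defer up.Close()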

// NewUpstream4Test creates an upstream for unit tests.
func NewUpstream4Test(pdClient pd.Client) *Upstream {
	pdClock := pdutil.NewClock4Test()
	gcManager := gc.NewManager(
		etcd.GcServiceIDForTest(),
		pdClient, pdClock)
	res := &Upstream{
		ID:             testUpstreamID,
		PDClient:       pdClient,
		PDClock:        pdClock,
		GCManager:      gcManager,
		status:         normal,
		wg:             new(sync.WaitGroup),
		clock:          clock.New(),
		SecurityConfig: &security.Credential{},
		cancel:         func() {},
	}

	return res
}

// initUpstream initializes the upstream.
func initUpstream(ctx context.Context, up *Upstream, cfg CaptureTopologyCfg) error {
	ctx, up.cancel = context.WithCancel(ctx)
	grpcTLSOption, err := up.SecurityConfig.ToGRPCDialOption()
	if err != nil {
		up.err.Store(err)
		return errors.Trace(err)
	}
	// Init the tikv client TLS global config.
	initGlobalConfig(up.SecurityConfig)
	// The default upstream always uses the pdClient passed from the cdc server.
	if !up.isDefaultUpstream {
		up.PDClient, err = pd.NewClientWithContext(
			ctx, up.PdEndpoints, up.SecurityConfig.PDSecurityOption(),
			// The default `timeout` is 3s, which may be too small if the pd is
			// busy; set it to 10s to avoid frequent timeouts.
			pd.WithCustomTimeoutOption(10*time.Second),
			pd.WithGRPCDialOptions(
				grpcTLSOption,
				grpc.WithBlock(),
				grpc.WithConnectParams(grpc.ConnectParams{
					Backoff: backoff.Config{
						BaseDelay:  time.Second,
						Multiplier: 1.1,
						Jitter:     0.1,
						MaxDelay:   3 * time.Second,
					},
					MinConnectTimeout: 3 * time.Second,
				}),
			),
			pd.WithForwardingOption(config.EnablePDForwarding))
		if err != nil {
			up.err.Store(err)
			return errors.Trace(err)
		}

		etcdCli, err := etcd.CreateRawEtcdClient(up.SecurityConfig, grpcTLSOption, up.PdEndpoints...)
		if err != nil {
			return errors.Trace(err)
		}
		up.etcdCli = etcd.Wrap(etcdCli, make(map[string]prometheus.Counter))
	}
	clusterID := up.PDClient.GetClusterID(ctx)
	if up.ID != 0 && up.ID != clusterID {
		err := fmt.Errorf("upstream id mismatch, expected: %d, actual: %d",
			up.ID, clusterID)
		up.err.Store(err)
		return errors.Trace(err)
	}
	up.ID = clusterID

	// To avoid blocking CDC server startup, we warn instead of erroring
	// when TiKV is incompatible.
	errorTiKVIncompatible := false
	err = version.CheckClusterVersion(ctx, up.PDClient,
		up.PdEndpoints, up.SecurityConfig, errorTiKVIncompatible)
	if err != nil {
		up.err.Store(err)
		log.Error("init upstream error", zap.Error(err))
		return errors.Trace(err)
	}

	up.KVStorage, err = kv.CreateTiStore(strings.Join(up.PdEndpoints, ","), up.SecurityConfig)
	if err != nil {
		up.err.Store(err)
		return errors.Trace(err)
	}

	up.GrpcPool = kv.NewGrpcPoolImpl(ctx, up.SecurityConfig)

	up.RegionCache = tikv.NewRegionCache(up.PDClient)

	up.PDClock, err = pdutil.NewClock(ctx, up.PDClient)
	if err != nil {
		up.err.Store(err)
		return errors.Trace(err)
	}

	up.GCManager = gc.NewManager(cfg.GCServiceID, up.PDClient, up.PDClock)

	// Update the meta-region label to ensure that the meta region is isolated
	// from data regions.
	pc, err := pdutil.NewPDAPIClient(up.PDClient, up.SecurityConfig)
	if err != nil {
		log.Error("create pd api client failed", zap.Error(err))
		return errors.Trace(err)
	}
	defer pc.Close()

	err = pc.UpdateMetaLabel(ctx)
	if err != nil {
		log.Warn("failed to verify region label rule",
			zap.Error(err),
			zap.Uint64("upstreamID", up.ID),
			zap.Strings("upstreamEndpoints", up.PdEndpoints))
	}
	err = up.registerTopologyInfo(ctx, cfg)
	if err != nil {
		return errors.Trace(err)
	}

	up.wg.Add(1)
	go func() {
		defer up.wg.Done()
		up.PDClock.Run(ctx)
	}()
	up.wg.Add(1)
	go func() {
		defer up.wg.Done()
		up.GrpcPool.RecycleConn(ctx)
	}()

	log.Info("upstream initialized successfully", zap.Uint64("upstreamID", up.ID))
	atomic.StoreInt32(&up.status, normal)
	return nil
}
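
// After initialization, a caller sharing this Upstream would typically gate
// its use on the status accessors defined later in this file; a minimal
// sketch:
//
//	if !up.IsNormal() {
//		if err := up.Error(); err != nil {
//			// Surface the initialization error instead of using the upstream.
//		}
//	}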

// initGlobalConfig initializes the global config for the tikv client TLS.
// The region cache health check will use the global config.
// TODO: remove this function after the tikv client TLS is refactored.
func initGlobalConfig(secCfg *security.Credential) {
	if secCfg.CAPath != "" || secCfg.CertPath != "" || secCfg.KeyPath != "" {
		conf := tikvconfig.GetGlobalConfig()
		conf.Security.ClusterSSLCA = secCfg.CAPath
		conf.Security.ClusterSSLCert = secCfg.CertPath
		conf.Security.ClusterSSLKey = secCfg.KeyPath
		conf.Security.ClusterVerifyCN = secCfg.CertAllowedCN
		tikvconfig.StoreGlobalConfig(conf)
	}
}
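
// For illustration, a minimal sketch (the file paths are hypothetical): a
// TLS-enabled credential such as
//
//	initGlobalConfig(&security.Credential{
//		CAPath:   "/etc/cdc/tls/ca.pem",
//		CertPath: "/etc/cdc/tls/cert.pem",
//		KeyPath:  "/etc/cdc/tls/key.pem",
//	})
//
// copies the certificate paths into the tikv client-go global config, while a
// zero-value Credential leaves the global config untouched.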

// Close closes all resources.
func (up *Upstream) Close() {
	up.mu.Lock()
	defer up.mu.Unlock()
	up.cancel()
	if atomic.LoadInt32(&up.status) == closed ||
		atomic.LoadInt32(&up.status) == closing {
		return
	}
	atomic.StoreInt32(&up.status, closing)

	// We should never close the default upstream's pdClient and etcdClient
	// here, because they are shared by the cdc server.
	if !up.isDefaultUpstream {
		if up.PDClient != nil {
			up.PDClient.Close()
		}
		if up.etcdCli != nil {
			err := up.etcdCli.Unwrap().Close()
			if err != nil {
				log.Warn("etcd client close failed", zap.Error(err))
			}
		}
	}

	if up.KVStorage != nil {
		err := up.KVStorage.Close()
		if err != nil {
			log.Warn("kv store close failed", zap.Error(err))
		}
	}

	if up.GrpcPool != nil {
		up.GrpcPool.Close()
	}
	if up.RegionCache != nil {
		up.RegionCache.Close()
	}
	if up.PDClock != nil {
		up.PDClock.Stop()
	}
	if up.session != nil {
		err := up.session.Close()
		if err != nil {
			log.Warn("etcd session close failed", zap.Error(err))
		}
	}

	up.wg.Wait()
	atomic.StoreInt32(&up.status, closed)
	log.Info("upstream closed", zap.Uint64("upstreamID", up.ID))
}

// Error returns the error that occurred while initializing this upstream.
func (up *Upstream) Error() error {
	return up.err.Load()
}

// IsNormal returns true if the upstream is normal.
func (up *Upstream) IsNormal() bool {
	return atomic.LoadInt32(&up.status) == normal && up.err.Load() == nil
}

// IsClosed returns true if the upstream is closed.
func (up *Upstream) IsClosed() bool {
	return atomic.LoadInt32(&up.status) == closed
}

// resetIdleTime resets the upstream idle time to zero.
func (up *Upstream) resetIdleTime() {
	up.mu.Lock()
	defer up.mu.Unlock()

	if !up.idleTime.IsZero() {
		log.Info("upstream idle time is set to 0",
			zap.Uint64("id", up.ID))
		up.idleTime = time.Time{}
	}
}

// trySetIdleTime sets the upstream idle time to the current time if it has
// not been set yet.
func (up *Upstream) trySetIdleTime() {
	up.mu.Lock()
	defer up.mu.Unlock()
	// Only set idleTime if it is unset.
	if up.idleTime.IsZero() {
		log.Info("upstream idle time is set to current time",
			zap.Uint64("id", up.ID))
		up.idleTime = up.clock.Now()
	}
}

func (up *Upstream) registerTopologyInfo(ctx context.Context, cfg CaptureTopologyCfg) error {
	lease, err := up.etcdCli.Grant(ctx, cfg.SessionTTL)
	if err != nil {
		return errors.Trace(err)
	}
	up.session, err = concurrency.NewSession(up.etcdCli.Unwrap(), concurrency.WithLease(lease.ID))
	if err != nil {
		return errors.Trace(err)
	}
	// Register the capture info to the upstream PD.
	key := fmt.Sprintf(topologyTiCDC, cfg.GCServiceID, cfg.AdvertiseAddr)
	value, err := cfg.CaptureInfo.Marshal()
	if err != nil {
		return errors.Trace(err)
	}
	_, err = up.etcdCli.Put(ctx, key, string(value), clientv3.WithLease(up.session.Lease()))
	return errors.WrapError(errors.ErrPDEtcdAPIError, err)
}

// shouldClose returns true if this upstream's idleTime reaches maxIdleDuration.
func (up *Upstream) shouldClose() bool {
	// The default upstream should never be closed.
	if up.isDefaultUpstream {
		return false
	}

	if !up.idleTime.IsZero() &&
		up.clock.Since(up.idleTime) >= maxIdleDuration {
		return true
	}

	return false
}
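
// For illustration, a minimal sketch (not part of this package's API) of how
// a manager that reference-counts holders might drive the idle helpers above;
// holdCount is a hypothetical count of the changefeeds and processors
// currently sharing this Upstream:
//
//	if holdCount == 0 {
//		up.trySetIdleTime()
//	} else {
//		up.resetIdleTime()
//	}
//	if up.shouldClose() {
//		up.Close()
//	}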

// VerifyTiDBUser verifies whether the username and password are valid in
// TiDB. It validates them by successfully building a connection to the
// upstream TiDB with the given username and password.
func (up *Upstream) VerifyTiDBUser(ctx context.Context, username, password string) error {
	tidbs, err := fetchTiDBTopology(ctx, up.etcdCli.Unwrap())
	if err != nil {
		return errors.Trace(err)
	}
	if len(tidbs) == 0 {
		return errors.New("tidb instance not found in topology, please check if the tidb is running")
	}

	for _, tidb := range tidbs {
		// Connect to this tidb instance.
		host := fmt.Sprintf("%s:%d", tidb.IP, tidb.Port)
		dsnStr := fmt.Sprintf("%s:%s@tcp(%s)/", username, password, host)
		err = up.doVerify(ctx, dsnStr)
		if err == nil {
			return nil
		}
		if errorutil.IsAccessDeniedError(err) {
			// For an access-denied error, we can return immediately.
			// For other errors, we continue to verify the next tidb instance.
			return errors.Trace(err)
		}
	}
	return errors.Trace(err)
}

func (up *Upstream) doVerify(ctx context.Context, dsnStr string) error {
	ctx, cancel := context.WithTimeout(ctx, defaultTimeout)
	defer cancel()

	dsn, err := dmysql.ParseDSN(dsnStr)
	if err != nil {
		return errors.Trace(err)
	}
	// Note: we use "preferred" here to make sure the connection is encrypted
	// if possible. It is the same as the default behavior of the mysql client;
	// see https://dev.mysql.com/doc/refman/8.0/en/using-encrypted-connections.html.
	dsn.TLSConfig = "preferred"

	db, err := pmysql.GetTestDB(ctx, dsn, pmysql.CreateMySQLDBConn)
	if err != nil {
		return errors.Trace(err)
	}
	defer db.Close()

	rows, err := db.Query("SHOW STATUS LIKE '%Ssl_cipher'")
	if err != nil {
		return errors.Trace(err)
	}
	defer func() {
		if err := rows.Close(); err != nil {
			log.Warn("query Ssl_cipher close rows failed", zap.Error(err))
		}
		if rows.Err() != nil {
			log.Warn("query Ssl_cipher rows has error", zap.Error(rows.Err()))
		}
	}()

	var name, value string
	// rows.Next must be called before rows.Scan; otherwise Scan always
	// returns an error.
	if rows.Next() {
		err = rows.Scan(&name, &value)
		if err != nil {
			log.Warn("failed to get ssl cipher", zap.Error(err),
				zap.String("username", dsn.User), zap.Uint64("upstreamID", up.ID))
		}
	}
	log.Info("verify tidb user successfully", zap.String("username", dsn.User),
		zap.String("sslCipherName", name), zap.String("sslCipherValue", value),
		zap.Uint64("upstreamID", up.ID))
	return nil
}
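
// Usage sketch (the credentials are hypothetical): a caller that wants to
// validate a user before starting a changefeed might do:
//
//	if err := up.VerifyTiDBUser(ctx, "cdc_user", "cdc_password"); err != nil {
//		// Reject the changefeed configuration.
//	}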