github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/migrate/migrate.go (about) 1 // Copyright 2022 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package migrate 15 16 import ( 17 "context" 18 "encoding/json" 19 "fmt" 20 "net/url" 21 "strconv" 22 "strings" 23 "time" 24 25 "github.com/pingcap/errors" 26 "github.com/pingcap/log" 27 "github.com/pingcap/tiflow/cdc/model" 28 "github.com/pingcap/tiflow/pkg/config" 29 cerror "github.com/pingcap/tiflow/pkg/errors" 30 "github.com/pingcap/tiflow/pkg/etcd" 31 "github.com/pingcap/tiflow/pkg/pdutil" 32 "github.com/pingcap/tiflow/pkg/security" 33 "github.com/pingcap/tiflow/pkg/txnutil/gc" 34 pd "github.com/tikv/pd/client" 35 clientV3 "go.etcd.io/etcd/client/v3" 36 "go.etcd.io/etcd/client/v3/concurrency" 37 "go.uber.org/atomic" 38 "go.uber.org/zap" 39 "google.golang.org/grpc" 40 "google.golang.org/grpc/backoff" 41 ) 42 43 const ( 44 // cdcMetaVersion is hard code value indicate the metaVersion of TiCDC 45 cdcMetaVersion = 1 46 etcdSessionTTL = 10 47 campaignTimeoutDuration = 1 * time.Minute 48 noMetaVersion = -1 49 migrateLogsWarnDuration = 5 * time.Second 50 migrationCampaignKey = "ticdc-migration" 51 oldChangefeedPrefix = "/tidb/cdc/changefeed/info" 52 oldGcServiceID = "ticdc" 53 ) 54 55 type keys map[string]string 56 57 func (k keys) addPair(old, new string) { 58 k[old] = new 59 } 60 61 // Migrator migrates the cdc metadata 62 type Migrator interface { 63 // ShouldMigrate checks if we need to migrate metadata 64 ShouldMigrate(ctx context.Context) (bool, error) 65 // Migrate migrates the cdc metadata 66 Migrate(ctx context.Context) error 67 // WaitMetaVersionMatched wait util migration is done 68 WaitMetaVersionMatched(ctx context.Context) error 69 // MarkMigrateDone marks migration is done 70 MarkMigrateDone() 71 // IsMigrateDone check if migration is done 72 IsMigrateDone() bool 73 } 74 75 type migrator struct { 76 // oldMetaVersion int 77 newMetaVersion int 78 metaVersionKey string 79 // cdc old owner key 80 oldOwnerKey string 81 // etcd client 82 cli etcd.CDCEtcdClient 83 // all keyPrefixes needed to be migrated or update 84 // map from oldKeyPrefix to newKeyPrefix 85 keyPrefixes keys 86 87 done atomic.Bool 88 89 pdEndpoints []string 90 config *config.ServerConfig 91 92 createPDClientFunc func(ctx context.Context, 93 pdEndpoints []string, 94 conf *security.Credential) (pd.Client, error) 95 } 96 97 // NewMigrator returns a cdc metadata 98 func NewMigrator(cli etcd.CDCEtcdClient, 99 pdEndpoints []string, 100 serverConfig *config.ServerConfig, 101 ) Migrator { 102 metaVersionCDCKey := &etcd.CDCKey{ 103 Tp: etcd.CDCKeyTypeMetaVersion, 104 ClusterID: cli.GetClusterID(), 105 } 106 return &migrator{ 107 newMetaVersion: cdcMetaVersion, 108 metaVersionKey: metaVersionCDCKey.String(), 109 oldOwnerKey: "/ticdc/cdc/owner", 110 cli: cli, 111 keyPrefixes: make(keys), 112 pdEndpoints: pdEndpoints, 113 config: serverConfig, 114 createPDClientFunc: createPDClient, 115 } 116 } 117 118 // MarkMigrateDone marks migration is done 119 func (m *migrator) MarkMigrateDone() { 120 m.done.Store(true) 121 } 122 123 // IsMigrateDone check if migration is done 124 func (m *migrator) IsMigrateDone() bool { 125 return m.done.Load() 126 } 127 128 func createPDClient(ctx context.Context, 129 pdEndpoints []string, 130 conf *security.Credential, 131 ) (pd.Client, error) { 132 grpcTLSOption, err := conf.ToGRPCDialOption() 133 if err != nil { 134 return nil, errors.Trace(err) 135 } 136 return pd.NewClientWithContext( 137 ctx, pdEndpoints, conf.PDSecurityOption(), 138 pd.WithGRPCDialOptions( 139 grpcTLSOption, 140 grpc.WithBlock(), 141 grpc.WithConnectParams(grpc.ConnectParams{ 142 Backoff: backoff.Config{ 143 BaseDelay: time.Second, 144 Multiplier: 1.1, 145 Jitter: 0.1, 146 MaxDelay: 3 * time.Second, 147 }, 148 MinConnectTimeout: 3 * time.Second, 149 }), 150 ), 151 pd.WithForwardingOption(config.EnablePDForwarding), 152 ) 153 } 154 155 // Note: we do not use etcd transaction to migrate key 156 // as it has the maximum operation limit in a single transaction. 157 // So we use a double check mechanism to make sure the migration is complete. 158 // 1. check and put metaVersion 159 // 2. campaign old owner 160 // 3. update keys 161 // 4. check metadata consistency 162 // 5. update metaVersion 163 func (m *migrator) migrate(ctx context.Context, etcdNoMetaVersion bool, oldVersion int) error { 164 pdClient, err := m.createPDClientFunc(ctx, 165 m.pdEndpoints, m.config.Security) 166 if err != nil { 167 return errors.Trace(err) 168 } 169 defer pdClient.Close() 170 171 upstreamID := pdClient.GetClusterID(ctx) 172 // 1.1 check metaVersion, if the metaVersion in etcd does not match 173 // m.oldMetaVersion, it means that someone has migrated the metadata 174 metaVersion, err := getMetaVersion(ctx, m.cli.GetEtcdClient(), m.cli.GetClusterID()) 175 if err != nil { 176 log.Error("get meta version failed, etcd meta data migration failed", zap.Error(err)) 177 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 178 } 179 180 if metaVersion > m.newMetaVersion { 181 log.Panic("meta version in etcd is greater than the meta version in TiCDC", 182 zap.Int("etcdMetaVersion", metaVersion), zap.Int("cdcMetaVersion", m.newMetaVersion)) 183 } 184 185 // if metaVersion in etcd is equal to m.newMetaVersion, 186 // it means that there is no need to migrate 187 if !etcdNoMetaVersion && metaVersion == m.newMetaVersion { 188 log.Warn("meta version no match, no need to migrate") 189 return nil 190 } 191 192 // 1.2 put metaVersionKey to etcd to panic old version cdc server 193 if etcdNoMetaVersion { 194 _, err := m.cli.GetEtcdClient().Put(ctx, m.metaVersionKey, fmt.Sprintf("%d", oldVersion)) 195 if err != nil { 196 log.Error("put meta version failed, etcd meta data migration failed", zap.Error(err)) 197 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 198 } 199 } 200 201 // 3.campaign old owner to make sure old keys will not be updates 202 campaignCtx, cancel := context.WithTimeout(ctx, campaignTimeoutDuration) 203 defer cancel() 204 if err := m.campaignOldOwner(campaignCtx); err != nil { 205 if errors.ErrorEqual(err, context.DeadlineExceeded) { 206 log.Error("campaign old owner timeout", 207 zap.Duration("duration", campaignTimeoutDuration)) 208 } 209 log.Error("campaign old owner failed, etcd meta data migration failed", 210 zap.Error(err)) 211 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 212 } 213 214 beforeKV := make(map[string][]byte) 215 // 4.campaign owner successfully, begin to migrate data 216 for oldPrefix, newPrefix := range m.keyPrefixes { 217 resp, err := m.cli.GetEtcdClient().Get(ctx, oldPrefix, clientV3.WithPrefix()) 218 if err != nil { 219 log.Error("get old meta data failed, etcd meta data migration failed", 220 zap.Error(err)) 221 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 222 } 223 for _, v := range resp.Kvs { 224 oldKey := string(v.Key) 225 newKey := newPrefix + oldKey[len(oldPrefix):] 226 beforeKV[newKey] = v.Value 227 log.Info("migrate key", zap.String("oldKey", oldKey), zap.String("newKey", newKey)) 228 if strings.HasPrefix(string(v.Key), oldChangefeedPrefix) { 229 info := new(model.ChangeFeedInfo) 230 err = info.Unmarshal(v.Value) 231 if err != nil { 232 log.Error("unmarshal changefeed failed", 233 zap.String("value", string(v.Value)), 234 zap.Error(err)) 235 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 236 } 237 info.UpstreamID = upstreamID 238 info.Namespace = model.DefaultNamespace 239 // changefeed id is a part of etcd key path 240 // for example: /tidb/cdc/changefeed/info/abcd, abcd is the changefeed 241 info.ID = strings.TrimPrefix(string(v.Key), oldChangefeedPrefix+"/") 242 var str string 243 str, err = info.Marshal() 244 if err != nil { 245 log.Error("marshal changefeed failed", 246 zap.Error(err)) 247 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 248 } 249 _, err = m.cli.GetEtcdClient().Put(ctx, newKey, str) 250 } else { 251 _, err = m.cli.GetEtcdClient().Put(ctx, newKey, string(v.Value)) 252 } 253 if err != nil { 254 log.Error("put new meta data failed, etcd meta data migration failed", 255 zap.Error(err)) 256 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 257 } 258 } 259 } 260 // put upstream id 261 err = m.saveUpstreamInfo(ctx) 262 if err != nil { 263 log.Error("save default upstream failed, "+ 264 "etcd meta data migration failed", zap.Error(err)) 265 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 266 } 267 268 err = m.migrateGcServiceSafePoint(ctx, pdClient, 269 m.config.Security, m.cli.GetGCServiceID(), m.config.GcTTL) 270 if err != nil { 271 log.Error("update meta version failed, etcd meta data migration failed", zap.Error(err)) 272 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 273 } 274 275 // 5. update metaVersion 276 _, err = m.cli.GetEtcdClient().Put(ctx, m.metaVersionKey, fmt.Sprintf("%d", m.newMetaVersion)) 277 if err != nil { 278 log.Error("update meta version failed, etcd meta data migration failed", zap.Error(err)) 279 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 280 } 281 log.Info("etcd data migration successful") 282 cleanOldData(ctx, m.cli.GetEtcdClient()) 283 log.Info("clean old etcd data successful") 284 return nil 285 } 286 287 func cleanOldData(ctx context.Context, client *etcd.Client) { 288 resp, err := client.Get(ctx, "/tidb/cdc", clientV3.WithPrefix()) 289 if err != nil { 290 log.Warn("query data from etcd failed", 291 zap.Error(err)) 292 } 293 for _, kvPair := range resp.Kvs { 294 key := string(kvPair.Key) 295 if shouldDelete(key) { 296 value := string(kvPair.Value) 297 if strings.HasPrefix(key, oldChangefeedPrefix) { 298 value = maskChangefeedInfo(kvPair.Value) 299 } 300 // 0 is the backup version. For now, we only support version 0 301 newKey := etcd.MigrateBackupKey(0, key) 302 log.Info("renaming old etcd data", 303 zap.String("key", key), 304 zap.String("newKey", newKey), 305 zap.String("value", value)) 306 if _, err := client.Put(ctx, newKey, 307 string(kvPair.Value)); err != nil { 308 log.Info("put new key failed", zap.String("key", key), 309 zap.Error(err)) 310 } 311 if _, err := client.Delete(ctx, key); err != nil { 312 log.Warn("failed to delete old data", 313 zap.String("key", key), 314 zap.Error(err)) 315 } 316 } 317 } 318 } 319 320 // old key prefix that should be removed 321 var oldKeyPrefix = []string{ 322 "/tidb/cdc/changefeed/info", 323 "/tidb/cdc/job", 324 "/tidb/cdc/meta/ticdc-delete-etcd-key-count", 325 "/tidb/cdc/owner", 326 "/tidb/cdc/capture", 327 "/tidb/cdc/task/workload", 328 "/tidb/cdc/task/position", 329 "/tidb/cdc/task/status", 330 } 331 332 // shouldDelete check if a key should be deleted 333 func shouldDelete(key string) bool { 334 for _, prefix := range oldKeyPrefix { 335 if strings.HasPrefix(key, prefix) { 336 return true 337 } 338 } 339 return false 340 } 341 342 func maskChangefeedInfo(data []byte) string { 343 value := string(data) 344 oldConfig := map[string]any{} 345 err := json.Unmarshal(data, &oldConfig) 346 if err != nil { 347 log.Info("marshal oldConfig failed", 348 zap.Error(err)) 349 } 350 sinkURI, ok := oldConfig["sink-uri"] 351 if ok { 352 sinkURIParsed, err := url.Parse(sinkURI.(string)) 353 if err != nil { 354 log.Error("failed to parse sink URI", zap.Error(err)) 355 } 356 if sinkURIParsed.User != nil && sinkURIParsed.User.String() != "" { 357 sinkURIParsed.User = url.UserPassword("username", "password") 358 } 359 if sinkURIParsed.Host != "" { 360 sinkURIParsed.Host = "***" 361 } 362 oldConfig["sink-uri"] = sinkURIParsed.String() 363 buf, err := json.Marshal(oldConfig) 364 if err != nil { 365 log.Info("marshal oldConfig failed", 366 zap.Error(err)) 367 } 368 value = string(buf) 369 } 370 return value 371 } 372 373 func (m *migrator) migrateGcServiceSafePoint(ctx context.Context, 374 pdClient pd.Client, 375 config *security.Credential, 376 newGcServiceID string, 377 ttl int64, 378 ) error { 379 pc, err := pdutil.NewPDAPIClient(pdClient, config) 380 if err != nil { 381 log.Error("create pd api client failed", zap.Error(err)) 382 return errors.Trace(err) 383 } 384 defer pc.Close() 385 386 gcServiceSafePoints, err := pc.ListGcServiceSafePoint(ctx) 387 if err != nil { 388 log.Error("list gc service safepoint failed", 389 zap.Error(err)) 390 return errors.Trace(err) 391 } 392 var cdcGcSafePoint *pdutil.ServiceSafePoint 393 for _, item := range gcServiceSafePoints.ServiceGCSafepoints { 394 if item.ServiceID == oldGcServiceID { 395 cdcGcSafePoint = item 396 break 397 } 398 } 399 if cdcGcSafePoint != nil { 400 _, err := gc.SetServiceGCSafepoint(ctx, pdClient, newGcServiceID, 401 ttl, 402 cdcGcSafePoint.SafePoint) 403 if err != nil { 404 log.Error("set gc service safepoint failed", 405 zap.Error(err)) 406 return errors.Trace(err) 407 } 408 err = gc.RemoveServiceGCSafepoint(ctx, pdClient, oldGcServiceID) 409 if err != nil { 410 log.Warn("remove old gc safepoint failed", zap.Error(err)) 411 } 412 } 413 return nil 414 } 415 416 func (m *migrator) campaignOldOwner(ctx context.Context) error { 417 sess, err := concurrency.NewSession(m.cli.GetEtcdClient().Unwrap(), 418 concurrency.WithTTL(etcdSessionTTL)) 419 if err != nil { 420 return errors.Trace(err) 421 } 422 election := concurrency.NewElection(sess, m.oldOwnerKey) 423 defer func() { 424 _ = sess.Close() 425 }() 426 427 if err := election.Campaign(ctx, migrationCampaignKey); err != nil { 428 return errors.Trace(err) 429 } 430 return nil 431 } 432 433 // Migrate migrate etcd meta data 434 func (m *migrator) Migrate(ctx context.Context) error { 435 version, err := getMetaVersion(ctx, m.cli.GetEtcdClient(), m.cli.GetClusterID()) 436 if err != nil { 437 return errors.Trace(err) 438 } 439 440 shouldMigrate := false 441 oldVersion, newVersion := 0, cdcMetaVersion 442 443 if version == noMetaVersion { 444 if m.cli.GetClusterID() != etcd.DefaultCDCClusterID { 445 // not default cluster 446 log.Info("not a default cdc cluster, skip migration data", 447 zap.String("cluster", m.cli.GetClusterID())) 448 // put upstream id 449 err = m.saveUpstreamInfo(ctx) 450 if err != nil { 451 log.Error("save default upstream failed, "+ 452 "etcd meta data migration failed", 453 zap.Error(err)) 454 return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) 455 } 456 _, err := m.cli.GetEtcdClient(). 457 Put(ctx, m.metaVersionKey, fmt.Sprintf("%d", newVersion)) 458 if err != nil { 459 log.Error("put meta version failed", zap.Error(err)) 460 } 461 return err 462 } 463 shouldMigrate = true 464 } else if version > newVersion { 465 log.Panic("meta version in etcd is greater than the meta version in TiCDC", 466 zap.Int("etcdMetaVersion", version), zap.Int("cdcMetaVersion", m.newMetaVersion)) 467 } else { 468 oldVersion = version 469 shouldMigrate = oldVersion < newVersion 470 } 471 472 if !shouldMigrate { 473 return nil 474 } 475 476 m.keyPrefixes.addPair("/tidb/cdc/changefeed/info", 477 etcd.DefaultClusterAndNamespacePrefix+etcd.ChangefeedInfoKey) 478 m.keyPrefixes.addPair("/tidb/cdc/job", 479 etcd.DefaultClusterAndNamespacePrefix+etcd.ChangefeedStatusKey) 480 481 return m.migrate(ctx, version == noMetaVersion, oldVersion) 482 } 483 484 // ShouldMigrate checks if we should migrate etcd metadata 485 func (m *migrator) ShouldMigrate(ctx context.Context) (bool, error) { 486 version, err := getMetaVersion(ctx, m.cli.GetEtcdClient(), m.cli.GetClusterID()) 487 if err != nil { 488 return false, errors.Trace(err) 489 } 490 return version != cdcMetaVersion, nil 491 } 492 493 // WaitMetaVersionMatched checks and waits until the metaVersion in etcd 494 // matched to lock cdcMetaVersion 495 func (m *migrator) WaitMetaVersionMatched(ctx context.Context) error { 496 version, err := getMetaVersion(ctx, m.cli.GetEtcdClient(), m.cli.GetClusterID()) 497 if err != nil { 498 return errors.Trace(err) 499 } 500 if version == cdcMetaVersion { 501 return nil 502 } 503 504 ticker := time.NewTicker(time.Second) 505 defer ticker.Stop() 506 warnLogTicker := time.NewTicker(migrateLogsWarnDuration) 507 defer warnLogTicker.Stop() 508 start := time.Now() 509 for { 510 select { 511 case <-ctx.Done(): 512 return ctx.Err() 513 case <-ticker.C: 514 version, err := getMetaVersion(ctx, m.cli.GetEtcdClient(), m.cli.GetClusterID()) 515 if err != nil { 516 return errors.Trace(err) 517 } 518 if version == cdcMetaVersion { 519 return nil 520 } 521 case <-warnLogTicker.C: 522 log.Warn("meta data migrating last too long", 523 zap.Duration("duration", time.Since(start))) 524 } 525 } 526 } 527 528 // saveUpstreamInfo save the default upstream info to etcd 529 func (m *migrator) saveUpstreamInfo(ctx context.Context) error { 530 pdClient, err := m.createPDClientFunc(ctx, 531 m.pdEndpoints, m.config.Security) 532 if err != nil { 533 return errors.Trace(err) 534 } 535 defer pdClient.Close() 536 537 upstreamID := pdClient.GetClusterID(ctx) 538 upstreamKey := etcd.CDCKey{ 539 Tp: etcd.CDCKeyTypeUpStream, 540 ClusterID: m.cli.GetClusterID(), 541 UpstreamID: upstreamID, 542 Namespace: model.DefaultNamespace, 543 } 544 upstreamKeyStr := upstreamKey.String() 545 upstreamInfo := &model.UpstreamInfo{ 546 ID: upstreamID, 547 PDEndpoints: strings.Join(m.pdEndpoints, ","), 548 KeyPath: m.config.Security.KeyPath, 549 CertPath: m.config.Security.CertPath, 550 CAPath: m.config.Security.CAPath, 551 CertAllowedCN: m.config.Security.CertAllowedCN, 552 } 553 upstreamInfoStr, err := upstreamInfo.Marshal() 554 if err != nil { 555 return errors.Trace(err) 556 } 557 _, err = m.cli.GetEtcdClient().Put(ctx, upstreamKeyStr, string(upstreamInfoStr)) 558 return err 559 } 560 561 func getMetaVersion(ctx context.Context, cli *etcd.Client, clusterID string) (int, error) { 562 key := etcd.CDCKey{Tp: etcd.CDCKeyTypeMetaVersion, ClusterID: clusterID} 563 resp, err := cli.Get(ctx, key.String()) 564 if err != nil { 565 return 0, errors.Trace(err) 566 } 567 // means there no metaVersion in etcd 568 if len(resp.Kvs) == 0 { 569 return noMetaVersion, nil 570 } 571 572 version, err := strconv.Atoi(string(resp.Kvs[0].Value)) 573 if err != nil { 574 return 0, errors.Trace(err) 575 } 576 return version, nil 577 } 578 579 // NoOpMigrator do nothing 580 type NoOpMigrator struct{} 581 582 // ShouldMigrate checks if we need to migrate metadata 583 func (f *NoOpMigrator) ShouldMigrate(_ context.Context) (bool, error) { 584 return false, nil 585 } 586 587 // Migrate migrates the cdc metadata 588 func (f *NoOpMigrator) Migrate(_ context.Context) error { 589 return nil 590 } 591 592 // WaitMetaVersionMatched wait util migration is done 593 func (f *NoOpMigrator) WaitMetaVersionMatched(_ context.Context) error { 594 return nil 595 } 596 597 // MarkMigrateDone marks migration is done 598 func (f *NoOpMigrator) MarkMigrateDone() { 599 } 600 601 // IsMigrateDone check if migration is done 602 func (f *NoOpMigrator) IsMigrateDone() bool { 603 return true 604 }