github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/pkg/ha/stage.go (about) 1 // Copyright 2020 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package ha 15 16 import ( 17 "context" 18 "encoding/json" 19 "fmt" 20 21 "github.com/pingcap/tiflow/dm/common" 22 "github.com/pingcap/tiflow/dm/config" 23 "github.com/pingcap/tiflow/dm/pb" 24 "github.com/pingcap/tiflow/dm/pkg/etcdutil" 25 "github.com/pingcap/tiflow/dm/pkg/log" 26 "github.com/pingcap/tiflow/dm/pkg/terror" 27 "go.etcd.io/etcd/api/v3/mvccpb" 28 clientv3 "go.etcd.io/etcd/client/v3" 29 "go.uber.org/zap" 30 ) 31 32 // Stage represents the running stage for a relay or subtask. 33 type Stage struct { 34 Expect pb.Stage `json:"expect"` // the expectant stage. 35 Source string `json:"source"` // the source ID of the upstream. 36 Task string `json:"task,omitempty"` // the task name for subtask; empty for relay. 37 38 // only used to report to the caller of the watcher, do not marsh it. 39 // if it's true, it means the stage has been deleted in etcd. 40 IsDeleted bool `json:"-"` 41 // record the etcd Revision of this Stage 42 Revision int64 `json:"-"` 43 } 44 45 // NewRelayStage creates a new Stage instance for relay. 46 func NewRelayStage(expect pb.Stage, source string) Stage { 47 return newStage(expect, source, "") 48 } 49 50 // NewSubTaskStage creates a new Stage instance for subtask. 51 func NewSubTaskStage(expect pb.Stage, source, task string) Stage { 52 return newStage(expect, source, task) 53 } 54 55 func NewValidatorStage(expect pb.Stage, source, task string) Stage { 56 return newStage(expect, source, task) 57 } 58 59 // newStage creates a new Stage instance. 60 func newStage(expect pb.Stage, source, task string) Stage { 61 return Stage{ 62 Expect: expect, 63 Source: source, 64 Task: task, 65 } 66 } 67 68 // String implements Stringer interface. 69 func (s Stage) String() string { 70 str, _ := s.toJSON() 71 return str 72 } 73 74 // toJSON returns the string of JSON represent. 75 func (s Stage) toJSON() (string, error) { 76 data, err := json.Marshal(s) 77 if err != nil { 78 return "", terror.ErrHAInvalidItem.Delegate(err, fmt.Sprintf("failed to marshal stage %+v", s)) 79 } 80 return string(data), nil 81 } 82 83 // IsEmpty returns true when this Stage has no value. 84 func (s Stage) IsEmpty() bool { 85 var emptyStage Stage 86 return s == emptyStage 87 } 88 89 // stageFromJSON constructs Stage from its JSON represent. 90 func stageFromJSON(str string) (s Stage, err error) { 91 if err = json.Unmarshal([]byte(str), &s); err != nil { 92 err = terror.ErrHAInvalidItem.Delegate(err, fmt.Sprintf("failed to unmarshal stage %s", str)) 93 } 94 return 95 } 96 97 // PutRelayStage puts the stage of the relay into etcd. 98 // k/v: sourceID -> the running stage of the relay. 99 func PutRelayStage(cli *clientv3.Client, stages ...Stage) (int64, error) { 100 ops, err := putRelayStageOp(stages...) 101 if err != nil { 102 return 0, err 103 } 104 _, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...)) 105 return rev, err 106 } 107 108 // DeleteRelayStage deleted the relay stage of this source. 109 func DeleteRelayStage(cli *clientv3.Client, source string) (int64, error) { 110 _, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(deleteRelayStageOp(source))) 111 return rev, err 112 } 113 114 // PutSubTaskStage puts the stage of the subtask into etcd. 115 // k/v: sourceID, task -> the running stage of the subtask. 116 func PutSubTaskStage(cli *clientv3.Client, stages ...Stage) (int64, error) { 117 ops, err := putSubTaskStageOp(stages...) 118 if err != nil { 119 return 0, err 120 } 121 _, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...)) 122 return rev, err 123 } 124 125 // GetRelayStage gets the relay stage for the specified upstream source. 126 // if the stage for the source not exist, return with `err == nil` and `revision=0`. 127 func GetRelayStage(cli *clientv3.Client, source string) (Stage, int64, error) { 128 ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout) 129 defer cancel() 130 131 var stage Stage 132 resp, err := cli.Get(ctx, common.StageRelayKeyAdapter.Encode(source)) 133 if err != nil { 134 return stage, 0, terror.ErrHAFailTxnOperation.Delegate(err, "failed to get relay stage for source %s", source) 135 } 136 137 if resp.Count == 0 { 138 return stage, resp.Header.Revision, nil 139 } else if resp.Count > 1 { 140 // this should not happen. 141 return stage, 0, terror.ErrConfigMoreThanOne.Generate(resp.Count, "relay stage", "source: "+source) 142 } 143 144 stage, err = stageFromJSON(string(resp.Kvs[0].Value)) 145 if err != nil { 146 return stage, 0, err 147 } 148 stage.Revision = resp.Kvs[0].ModRevision 149 150 return stage, resp.Header.Revision, nil 151 } 152 153 // GetAllRelayStage gets all relay stages. 154 // k/v: source ID -> relay stage. 155 func GetAllRelayStage(cli *clientv3.Client) (map[string]Stage, int64, error) { 156 ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout) 157 defer cancel() 158 159 resp, err := cli.Get(ctx, common.StageRelayKeyAdapter.Path(), clientv3.WithPrefix()) 160 if err != nil { 161 return nil, 0, terror.ErrHAFailTxnOperation.Delegate(err, "failed to get all relay stages") 162 } 163 164 stages := make(map[string]Stage) 165 for _, kv := range resp.Kvs { 166 stage, err2 := stageFromJSON(string(kv.Value)) 167 if err2 != nil { 168 return nil, 0, err2 169 } 170 stage.Revision = kv.ModRevision 171 stages[stage.Source] = stage 172 } 173 return stages, resp.Header.Revision, nil 174 } 175 176 // GetSubTaskStage gets the subtask stage for the specified upstream source and task name. 177 // if the stage for the source and task name not exist, return with `err == nil` and `revision=0`. 178 // if task name is "", it will return all subtasks' stage as a map{task-name: stage} for the source. 179 // if task name is given, it will return a map{task-name: stage} whose length is 1. 180 func GetSubTaskStage(cli *clientv3.Client, source, task string) (map[string]Stage, int64, error) { 181 return getStageByKey(cli, common.StageSubTaskKeyAdapter, source, task, 0) 182 } 183 184 func getStageByKey(cli *clientv3.Client, key common.KeyAdapter, source, task string, revision int64) (map[string]Stage, int64, error) { 185 ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout) 186 defer cancel() 187 188 var ( 189 stm = make(map[string]Stage) 190 resp *clientv3.GetResponse 191 err error 192 opts = make([]clientv3.OpOption, 0) 193 ) 194 if revision > 0 { 195 opts = append(opts, clientv3.WithRev(revision)) 196 } 197 if task != "" { 198 resp, err = cli.Get(ctx, key.Encode(source, task), opts...) 199 } else { 200 opts = append(opts, clientv3.WithPrefix()) 201 resp, err = cli.Get(ctx, key.Encode(source), opts...) 202 } 203 204 if err != nil { 205 return stm, 0, terror.ErrHAFailTxnOperation.Delegate(err, "failed to get subtask stage for source %s, task %s", source, task) 206 } 207 208 stages, err := getStagesFromResp(source, task, resp) 209 if err != nil { 210 return stm, 0, err 211 } 212 stm = stages[source] 213 214 return stm, resp.Header.Revision, nil 215 } 216 217 func GetValidatorStage(cli *clientv3.Client, source, task string, revision int64) (map[string]Stage, int64, error) { 218 return getStageByKey(cli, common.StageValidatorKeyAdapter, source, task, revision) 219 } 220 221 // GetAllSubTaskStage gets all subtask stages. 222 // k/v: source ID -> task name -> subtask stage. 223 func GetAllSubTaskStage(cli *clientv3.Client) (map[string]map[string]Stage, int64, error) { 224 return getAllStagesInner(cli, common.StageSubTaskKeyAdapter) 225 } 226 227 func getAllStagesInner(cli *clientv3.Client, key common.KeyAdapter) (map[string]map[string]Stage, int64, error) { 228 ctx, cancel := context.WithTimeout(cli.Ctx(), etcdutil.DefaultRequestTimeout) 229 defer cancel() 230 231 resp, err := cli.Get(ctx, key.Path(), clientv3.WithPrefix()) 232 if err != nil { 233 return nil, 0, terror.ErrHAFailTxnOperation.Delegate(err, "failed to get all subtask stages") 234 } 235 236 stages, err := getStagesFromResp("", "", resp) 237 if err != nil { 238 return nil, 0, err 239 } 240 241 return stages, resp.Header.Revision, nil 242 } 243 244 func GetAllValidatorStage(cli *clientv3.Client) (map[string]map[string]Stage, int64, error) { 245 return getAllStagesInner(cli, common.StageValidatorKeyAdapter) 246 } 247 248 // GetSubTaskStageConfig gets source's subtask stages and configs at the same time 249 // source **must not be empty** 250 // return map{task name -> subtask stage}, map{task name -> validator stage}, map{task name -> subtask config}, revision, error. 251 func GetSubTaskStageConfig(cli *clientv3.Client, source string) (map[string]Stage, map[string]Stage, map[string]config.SubTaskConfig, int64, error) { 252 var ( 253 stm = make(map[string]Stage) 254 validatorStageMap = make(map[string]Stage) 255 scm = make(map[string]config.SubTaskConfig) 256 ) 257 txnResp, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc( 258 clientv3.OpGet(common.StageSubTaskKeyAdapter.Encode(source), clientv3.WithPrefix()), 259 clientv3.OpGet(common.StageValidatorKeyAdapter.Encode(source), clientv3.WithPrefix()), 260 clientv3.OpGet(common.UpstreamSubTaskKeyAdapter.Encode(source), clientv3.WithPrefix()))) 261 if err != nil { 262 return stm, validatorStageMap, scm, 0, err 263 } 264 stageResp := txnResp.Responses[0].GetResponseRange() 265 stages, err := getStagesFromResp(source, "", (*clientv3.GetResponse)(stageResp)) 266 if err != nil { 267 return stm, validatorStageMap, scm, 0, err 268 } 269 stm = stages[source] 270 271 validatorStageResp := txnResp.Responses[1].GetResponseRange() 272 validatorStages, err := getStagesFromResp(source, "", (*clientv3.GetResponse)(validatorStageResp)) 273 if err != nil { 274 return stm, validatorStageMap, scm, 0, err 275 } 276 validatorStageMap = validatorStages[source] 277 278 cfgResp := txnResp.Responses[2].GetResponseRange() 279 cfgs, err := subTaskCfgFromResp(source, "", (*clientv3.GetResponse)(cfgResp)) 280 if err != nil { 281 return stm, validatorStageMap, scm, 0, err 282 } 283 scm = cfgs[source] 284 285 return stm, validatorStageMap, scm, rev, err 286 } 287 288 // WatchRelayStage watches PUT & DELETE operations for the relay stage. 289 // for the DELETE stage, it returns an empty stage. 290 func WatchRelayStage(ctx context.Context, cli *clientv3.Client, 291 source string, revision int64, outCh chan<- Stage, errCh chan<- error, 292 ) { 293 wCtx, cancel := context.WithCancel(ctx) 294 defer cancel() 295 ch := cli.Watch(wCtx, common.StageRelayKeyAdapter.Encode(source), clientv3.WithRev(revision)) 296 watchStage(ctx, ch, relayStageFromKey, outCh, errCh) 297 } 298 299 // WatchSubTaskStage watches PUT & DELETE operations for the subtask stage. 300 // for the DELETE stage, it returns an empty stage. 301 func WatchSubTaskStage(ctx context.Context, cli *clientv3.Client, 302 source string, revision int64, outCh chan<- Stage, errCh chan<- error, 303 ) { 304 wCtx, cancel := context.WithCancel(ctx) 305 defer cancel() 306 ch := cli.Watch(wCtx, common.StageSubTaskKeyAdapter.Encode(source), clientv3.WithPrefix(), clientv3.WithRev(revision)) 307 watchStage(ctx, ch, subTaskStageFromKey, outCh, errCh) 308 } 309 310 func WatchValidatorStage(ctx context.Context, cli *clientv3.Client, 311 source string, rev int64, outCh chan<- Stage, errCh chan<- error, 312 ) { 313 wCtx, cancel := context.WithCancel(ctx) 314 defer cancel() 315 ch := cli.Watch(wCtx, common.StageValidatorKeyAdapter.Encode(source), clientv3.WithPrefix(), clientv3.WithRev(rev)) 316 watchStage(ctx, ch, validatorStageFromKey, outCh, errCh) 317 } 318 319 // DeleteSubTaskStage deletes the subtask stage. 320 func DeleteSubTaskStage(cli *clientv3.Client, stages ...Stage) (int64, error) { 321 ops := deleteSubTaskStageOp(stages...) 322 _, rev, err := etcdutil.DoTxnWithRepeatable(cli, etcdutil.ThenOpFunc(ops...)) 323 return rev, err 324 } 325 326 // relayStageFromKey constructs an incomplete relay stage from an etcd key. 327 func relayStageFromKey(key string) (Stage, error) { 328 var stage Stage 329 ks, err := common.StageRelayKeyAdapter.Decode(key) 330 if err != nil { 331 return stage, err 332 } 333 stage.Source = ks[0] 334 return stage, nil 335 } 336 337 // subTaskStageFromKey constructs an incomplete subtask stage from an etcd key. 338 func subTaskStageFromKey(key string) (Stage, error) { 339 var stage Stage 340 ks, err := common.StageSubTaskKeyAdapter.Decode(key) 341 if err != nil { 342 return stage, err 343 } 344 stage.Source = ks[0] 345 stage.Task = ks[1] 346 return stage, nil 347 } 348 349 func validatorStageFromKey(key string) (Stage, error) { 350 var stage Stage 351 ks, err := common.StageValidatorKeyAdapter.Decode(key) 352 if err != nil { 353 return stage, err 354 } 355 stage.Source = ks[0] 356 stage.Task = ks[1] 357 return stage, nil 358 } 359 360 func getStagesFromResp(source, task string, resp *clientv3.GetResponse) (map[string]map[string]Stage, error) { 361 stages := make(map[string]map[string]Stage) 362 if source != "" { 363 stages[source] = make(map[string]Stage) // avoid stages[source] is nil 364 } 365 366 if resp.Count == 0 { 367 return stages, nil 368 } else if source != "" && task != "" && resp.Count > 1 { 369 // this should not happen. 370 return stages, terror.ErrConfigMoreThanOne.Generate(resp.Count, "stage", "(source "+source+", task "+task+")") 371 } 372 373 for _, kvs := range resp.Kvs { 374 stage, err := stageFromJSON(string(kvs.Value)) 375 if err != nil { 376 return nil, err 377 } 378 if _, ok := stages[stage.Source]; !ok { 379 stages[stage.Source] = make(map[string]Stage) 380 } 381 stage.Revision = kvs.ModRevision 382 stages[stage.Source][stage.Task] = stage 383 } 384 return stages, nil 385 } 386 387 // watchStage watches PUT & DELETE operations for the stage. 388 // nolint:dupl 389 func watchStage(ctx context.Context, watchCh clientv3.WatchChan, 390 stageFromKey func(key string) (Stage, error), outCh chan<- Stage, errCh chan<- error, 391 ) { 392 for { 393 select { 394 case <-ctx.Done(): 395 return 396 case resp, ok := <-watchCh: 397 if !ok { 398 return 399 } 400 if resp.Canceled { 401 // TODO(csuzhangxc): do retry here. 402 if resp.Err() != nil { 403 select { 404 case errCh <- terror.ErrHAFailWatchEtcd.Delegate(resp.Err(), "watch stage canceled"): 405 case <-ctx.Done(): 406 } 407 } 408 return 409 } 410 411 for _, ev := range resp.Events { 412 var ( 413 stage Stage 414 err error 415 ) 416 switch ev.Type { 417 case mvccpb.PUT: 418 stage, err = stageFromJSON(string(ev.Kv.Value)) 419 case mvccpb.DELETE: 420 stage, err = stageFromKey(string(ev.Kv.Key)) 421 stage.IsDeleted = true 422 default: 423 // this should not happen. 424 log.L().Error("unsupported etcd event type", zap.Reflect("kv", ev.Kv), zap.Reflect("type", ev.Type)) 425 continue 426 } 427 stage.Revision = ev.Kv.ModRevision 428 429 if err != nil { 430 select { 431 case errCh <- err: 432 case <-ctx.Done(): 433 return 434 } 435 } else { 436 select { 437 case outCh <- stage: 438 case <-ctx.Done(): 439 return 440 } 441 } 442 } 443 } 444 } 445 } 446 447 // putRelayStageOp returns a list of PUT etcd operation for the relay stage. 448 // k/v: sourceID -> the running stage of the relay. 449 func putRelayStageOp(stages ...Stage) ([]clientv3.Op, error) { 450 ops := make([]clientv3.Op, 0, len(stages)) 451 for _, stage := range stages { 452 value, err := stage.toJSON() 453 if err != nil { 454 return ops, err 455 } 456 key := common.StageRelayKeyAdapter.Encode(stage.Source) 457 ops = append(ops, clientv3.OpPut(key, value)) 458 } 459 return ops, nil 460 } 461 462 // putSubTaskStageOp returns a list of PUT etcd operations for the subtask stage. 463 // k/v: sourceID, task -> the running stage of the subtask. 464 func putSubTaskStageOp(stages ...Stage) ([]clientv3.Op, error) { 465 ops := make([]clientv3.Op, 0, len(stages)) 466 for _, stage := range stages { 467 value, err := stage.toJSON() 468 if err != nil { 469 return ops, err 470 } 471 key := common.StageSubTaskKeyAdapter.Encode(stage.Source, stage.Task) 472 ops = append(ops, clientv3.OpPut(key, value)) 473 } 474 return ops, nil 475 } 476 477 // deleteRelayStageOp returns a DELETE etcd operation for the relay stage. 478 func deleteRelayStageOp(source string) clientv3.Op { 479 return clientv3.OpDelete(common.StageRelayKeyAdapter.Encode(source)) 480 } 481 482 // deleteSubTaskStageOp returns a list of DELETE etcd operation for the subtask stage. 483 func deleteSubTaskStageOp(stages ...Stage) []clientv3.Op { 484 ops := make([]clientv3.Op, 0, len(stages)) 485 for _, stage := range stages { 486 ops = append(ops, clientv3.OpDelete(common.StageSubTaskKeyAdapter.Encode(stage.Source, stage.Task))) 487 } 488 return ops 489 }