github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/orchestrator/etcd_worker.go

// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package orchestrator

import (
	"context"
	"fmt"
	"strconv"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/pkg/chdelay"
	cerrors "github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/etcd"
	"github.com/pingcap/tiflow/pkg/migrate"
	"github.com/pingcap/tiflow/pkg/orchestrator/util"
	pkgutil "github.com/pingcap/tiflow/pkg/util"
	"github.com/prometheus/client_golang/prometheus"
	"go.etcd.io/etcd/api/v3/etcdserverpb"
	"go.etcd.io/etcd/api/v3/mvccpb"
	"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.etcd.io/etcd/client/v3/concurrency"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
	"golang.org/x/time/rate"
)

const (
	// When committing a txn to etcd or ticking the reactor takes longer than
	// etcdWorkerLogsWarnDuration, the EtcdWorker prints a warning log.
	etcdWorkerLogsWarnDuration = 1 * time.Second
)

// EtcdWorker handles all interactions with Etcd.
type EtcdWorker struct {
	clusterID string
	client    *etcd.Client
	reactor   Reactor
	state     ReactorState
	// rawState is the local cache of the latest Etcd state.
	rawState map[util.EtcdKey]rawStateEntry
	// pendingUpdates stores Etcd updates that the Reactor has not been notified of.
	pendingUpdates []*etcdUpdate
	// revision is the Etcd revision of the latest event received from Etcd
	// (which has not necessarily been applied to the ReactorState).
	revision int64
	// reactor.Tick() should not be called until revision >= barrierRev.
	barrierRev int64
	// prefix is the scope of the Etcd watch.
	prefix util.EtcdPrefix
	// deleteCounter maintains a local copy of a value stored in Etcd used to
	// keep track of how many deletes have been committed by an EtcdWorker
	// watching this key prefix.
	// This mechanism is necessary as a workaround to correctly detect
	// write-write conflicts when at least one transaction commits a delete,
	// because deletes in Etcd reset the mod-revision of keys, making it
	// difficult to use the mod-revision as a unique version identifier to
	// implement `compare-and-swap` semantics, which is essential for
	// implementing snapshot isolation for Reactor ticks.
	deleteCounter int64
	metrics       *etcdWorkerMetrics

	migrator     migrate.Migrator
	ownerMetaKey string
	isOwner      bool
}

type etcdWorkerMetrics struct {
	// kv events related metrics
	metricEtcdTxnSize            prometheus.Observer
	metricEtcdTxnDuration        prometheus.Observer
	metricEtcdWorkerTickDuration prometheus.Observer
}

type etcdUpdate struct {
	key      util.EtcdKey
	value    []byte
	revision int64
}

// rawStateEntry stores the latest version of a key as seen by the EtcdWorker.
// modRevision is stored to implement `compare-and-swap` semantics more reliably.
type rawStateEntry struct {
	value       []byte
	modRevision int64
}

// NewEtcdWorker returns a new EtcdWorker.
func NewEtcdWorker(
	client etcd.CDCEtcdClient,
	prefix string,
	reactor Reactor,
	initState ReactorState,
	migrator migrate.Migrator,
) (*EtcdWorker, error) {
	return &EtcdWorker{
		clusterID:  client.GetClusterID(),
		client:     client.GetEtcdClient(),
		reactor:    reactor,
		state:      initState,
		rawState:   make(map[util.EtcdKey]rawStateEntry),
		prefix:     util.NormalizePrefix(prefix),
		barrierRev: -1, // -1 indicates no barrier
		migrator:   migrator,
	}, nil
}
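
// A minimal usage sketch of NewEtcdWorker and Run. The identifiers
// cdcEtcdClient, myReactor, myState, migrator, sess, the prefix string and the
// tick interval are placeholders supplied by the caller, not names defined in
// this file:
//
//	worker, err := NewEtcdWorker(cdcEtcdClient, "/cdc-example", myReactor, myState, migrator)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	// Tick the reactor on etcd events, rate-limited to roughly one tick per interval.
//	return worker.Run(ctx, sess, 50*time.Millisecond, pkgutil.RoleController.String())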

func (worker *EtcdWorker) initMetrics() {
	metrics := &etcdWorkerMetrics{}
	metrics.metricEtcdTxnSize = etcdTxnSize
	metrics.metricEtcdTxnDuration = etcdTxnExecDuration
	metrics.metricEtcdWorkerTickDuration = etcdWorkerTickDuration
	worker.metrics = metrics
}

// Run starts the EtcdWorker event loop.
// A tick is generated either on a timer whose interval is timerInterval, or on an Etcd event.
// If the specified etcd session is Done, this Run function will exit with cerrors.ErrEtcdSessionDone.
// The specified etcd session may be nil.
func (worker *EtcdWorker) Run(ctx context.Context, session *concurrency.Session, timerInterval time.Duration, role string) error {
	defer worker.cleanUp()
	worker.isOwner = role == pkgutil.RoleController.String()
	// migrate data here
	err := worker.checkAndMigrateMetaData(ctx, role)
	if err != nil {
		return errors.Trace(err)
	}
	// migration is done, cdc server can serve http now
	worker.migrator.MarkMigrateDone()

	worker.initMetrics()

	err = worker.syncRawState(ctx)
	if err != nil {
		return errors.Trace(err)
	}

	ticker := time.NewTicker(timerInterval)
	defer ticker.Stop()

	watchCtx, cancel := context.WithCancel(ctx)
	defer cancel()
	watchCh := worker.client.Watch(watchCtx, worker.prefix.String(), role, clientv3.WithPrefix(), clientv3.WithRev(worker.revision+1))

	if role == pkgutil.RoleProcessor.String() {
		failpoint.Inject("ProcessorEtcdDelay", func() {
			delayer := chdelay.NewChannelDelayer(time.Second*3, watchCh, 1024, 16)
			defer delayer.Close()
			watchCh = delayer.Out()
		})
	}

	var (
		pendingPatches   [][]DataPatch
		committedChanges int
		exiting          bool
		retry            bool
		sessionDone      <-chan struct{}
	)
	if session != nil {
		sessionDone = session.Done()
		worker.ownerMetaKey = fmt.Sprintf("%s/%x",
			etcd.CaptureOwnerKey(worker.clusterID), session.Lease())
	} else {
		// should never be closed
		sessionDone = make(chan struct{})
	}

	// limit the number of times EtcdWorker can tick
	rl := rate.NewLimiter(rate.Every(timerInterval), 2)
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-sessionDone:
			return cerrors.ErrEtcdSessionDone.GenWithStackByArgs()
		case <-ticker.C:
			// There is no new event to handle on timer ticks, so we have nothing here.
		case response := <-watchCh:
			// In this select case, we receive new events from Etcd, and call handleEvent if appropriate.
			if err := response.Err(); err != nil {
				return errors.Trace(err)
			}

			// ProgressNotify implies no new events.
			if response.IsProgressNotify() {
				log.Debug("Etcd progress notification",
					zap.Int64("revision", response.Header.GetRevision()))
				// Note that we don't need to update the revision here, and we
				// should not do so, because the revision of the progress notification
				// may not satisfy the strict monotonicity we have expected.
				//
				// Updating `worker.revision` can cause a useful event with the
				// same revision to be dropped erroneously.
				//
				// Refer to https://etcd.io/docs/v3.3/dev-guide/interacting_v3/#watch-progress
				// "Note: The revision number in the progress notify response is the revision
				// from the local etcd server node that the watch stream is connected to. [...]"
				// This implies that the progress notification will NOT go through the raft
				// consensus, thereby NOT affecting the revision (index).
				continue
			}

			// Check whether the response is stale.
			if worker.revision >= response.Header.GetRevision() {
				log.Info("Stale Etcd event dropped",
					zap.Int64("eventRevision", response.Header.GetRevision()),
					zap.Int64("previousRevision", worker.revision),
					zap.Any("events", response.Events),
					zap.String("role", role))
				continue
			}
			worker.revision = response.Header.GetRevision()

			for _, event := range response.Events {
				// handleEvent will apply the event to our internal `rawState`.
				err = worker.handleEvent(ctx, event)
				if err != nil {
					// This error means the owner has resigned itself,
					// so we should exit the etcd worker and campaign to be the owner again.
					return err
				}
			}

		}

		tryCommitPendingPatches := func() (bool, error) {
			if len(pendingPatches) > 0 {
				// Here we have some patches yet to be uploaded to Etcd.
				pendingPatches, committedChanges, err = worker.applyPatchGroups(ctx, pendingPatches)
				if isRetryableError(err) {
					return true, nil
				}
				if err != nil {
					return false, errors.Trace(err)
				}
				// pendingPatches may be non-empty even though nothing was committed to etcd.
				// If nothing has changed, we should not break this tick.
				if committedChanges > 0 {
					return true, nil
				}
			}
			return false, nil
		}
		if retry, err = tryCommitPendingPatches(); err != nil {
			return err
		}
		if exiting {
			// If exiting is true here, it means that the reactor returned `ErrReactorFinished` last tick,
			// and all pending patches have been applied.
			return nil
		}
		if retry {
			continue
		}
		if worker.revision < worker.barrierRev {
			// We hold off notifying the Reactor because barrierRev has not been reached.
			// This usually happens when a committed write Txn has not been received by Watch.
			continue
		}

		// We are safe to update the ReactorState only if there is no pending patch.
		if err := worker.applyUpdates(); err != nil {
			return errors.Trace(err)
		}

		// If !rl.Allow(), skip this tick to avoid the etcd worker ticking the
		// reactor too frequently. This lets the EtcdWorker batch etcd change
		// events into worker.state.
		// The semantics of `ReactorState` requires that any implementation
		// can batch updates internally.
		if !rl.Allow() {
			continue
		}
		startTime := time.Now()
		// It is safe to call worker.reactor.Tick here because the batch of
		// updates has already been applied to worker.state.
		nextState, err := worker.reactor.Tick(ctx, worker.state)
		costTime := time.Since(startTime)
		if costTime > etcdWorkerLogsWarnDuration {
			log.Warn("EtcdWorker reactor tick took too long",
				zap.Duration("duration", costTime),
				zap.String("role", role))
		}
		worker.metrics.metricEtcdWorkerTickDuration.Observe(costTime.Seconds())
		if err != nil {
			if !cerrors.ErrReactorFinished.Equal(errors.Cause(err)) {
				return errors.Trace(err)
			}
			// normal exit
			exiting = true
		}
		worker.state = nextState
		pendingPatches = append(pendingPatches, nextState.GetPatches()...)
		// apply pending patches
		if _, err = tryCommitPendingPatches(); err != nil {
			return err
		}
	}
}
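
// A minimal Reactor sketch for illustration; it only assumes the Tick
// signature used above. Returning ErrReactorFinished makes Run exit normally
// once all pending patches have been committed:
//
//	type noopReactor struct{}
//
//	func (r *noopReactor) Tick(ctx context.Context, state ReactorState) (ReactorState, error) {
//		// A real Reactor would inspect the state and stage patches here.
//		return state, cerrors.ErrReactorFinished.GenWithStackByArgs()
//	}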

func isRetryableError(err error) bool {
	err = errors.Cause(err)
	if cerrors.ErrEtcdTryAgain.Equal(err) ||
		context.DeadlineExceeded == err {
		return true
	}
	// When encountering an abnormal connection with etcd, the worker will keep retrying
	// until the session is done.
	_, ok := err.(rpctypes.EtcdError)
	return ok
}

func (worker *EtcdWorker) handleEvent(_ context.Context, event *clientv3.Event) error {
	if worker.isDeleteCounterKey(event.Kv.Key) {
		switch event.Type {
		case mvccpb.PUT:
			worker.handleDeleteCounter(event.Kv.Value)
		case mvccpb.DELETE:
			log.Warn("deletion counter key deleted", zap.Reflect("event", event))
			worker.handleDeleteCounter(nil)
		}
		// We return here because the delete-counter is not used for business logic,
		// and it should not be exposed further to the Reactor.
		return nil
	}

	worker.pendingUpdates = append(worker.pendingUpdates, &etcdUpdate{
		key:      util.NewEtcdKeyFromBytes(event.Kv.Key),
		value:    event.Kv.Value,
		revision: event.Kv.ModRevision,
	})

	switch event.Type {
	case mvccpb.PUT:
		value := event.Kv.Value
		if value == nil {
			value = []byte{}
		}
		worker.rawState[util.NewEtcdKeyFromBytes(event.Kv.Key)] = rawStateEntry{
			value:       value,
			modRevision: event.Kv.ModRevision,
		}
	case mvccpb.DELETE:
		delete(worker.rawState, util.NewEtcdKeyFromBytes(event.Kv.Key))
		if string(event.Kv.Key) == worker.ownerMetaKey {
			if worker.isOwner {
				err := cerrors.ErrNotOwner.GenWithStackByArgs()
				log.Error("owner key is deleted, which may be caused by the owner "+
					"resigning or an external delete operation; "+
					"exit etcd worker and campaign again",
					zap.String("ownerMetaKey", worker.ownerMetaKey),
					zap.String("value", string(event.Kv.Value)),
					zap.Error(err))
				return err
			}
		}
	}
	return nil
}

func (worker *EtcdWorker) syncRawState(ctx context.Context) error {
	resp, err := worker.client.Get(ctx, worker.prefix.String(), clientv3.WithPrefix())
	if err != nil {
		return errors.Trace(err)
	}

	worker.rawState = make(map[util.EtcdKey]rawStateEntry)
	for _, kv := range resp.Kvs {
		if worker.isDeleteCounterKey(kv.Key) {
			worker.handleDeleteCounter(kv.Value)
			continue
		}
		key := util.NewEtcdKeyFromBytes(kv.Key)
		worker.rawState[key] = rawStateEntry{
			value:       kv.Value,
			modRevision: kv.ModRevision,
		}
		err := worker.state.Update(key, kv.Value, true)
		if err != nil {
			return errors.Trace(err)
		}
	}

	worker.revision = resp.Header.Revision
	return nil
}

func (worker *EtcdWorker) cloneRawState() map[util.EtcdKey][]byte {
	ret := make(map[util.EtcdKey][]byte)
	for k, v := range worker.rawState {
		vCloned := make([]byte, len(v.value))
		copy(vCloned, v.value)
		ret[util.NewEtcdKey(k.String())] = vCloned
	}
	return ret
}

func (worker *EtcdWorker) applyPatchGroups(ctx context.Context, patchGroups [][]DataPatch) ([][]DataPatch, int, error) {
	state := worker.cloneRawState()
	committedChanges := 0
	for len(patchGroups) > 0 {
		changedState, n, size, err := getBatchChangedState(state, patchGroups)
		if err != nil {
			return patchGroups, committedChanges, err
		}
		committedChanges += len(changedState)
		err = worker.commitChangedState(ctx, changedState, size)
		if err != nil {
			return patchGroups, committedChanges, err
		}
		patchGroups = patchGroups[n:]
	}
	return patchGroups, committedChanges, nil
}

func (worker *EtcdWorker) commitChangedState(ctx context.Context, changedState map[util.EtcdKey][]byte, size int) error {
	if len(changedState) == 0 {
		return nil
	}

	cmps := make([]clientv3.Cmp, 0, len(changedState))
	opsThen := make([]clientv3.Op, 0, len(changedState))
	hasDelete := false

	for key, value := range changedState {
		// make sure someone else has not updated the key after the last snapshot
		var cmp clientv3.Cmp
		if entry, ok := worker.rawState[key]; ok {
			cmp = clientv3.Compare(clientv3.ModRevision(key.String()), "=", entry.modRevision)
		} else {
			// If ok is false, the key of this patch does not exist in the committed state,
			// so this compare is equivalent to asserting that `patch.Key` does not exist.
			cmp = clientv3.Compare(clientv3.ModRevision(key.String()), "=", 0)
		}
		cmps = append(cmps, cmp)

		var op clientv3.Op
		if value != nil {
			op = clientv3.OpPut(key.String(), string(value))
		} else {
			op = clientv3.OpDelete(key.String())
			hasDelete = true
		}
		opsThen = append(opsThen, op)
	}

	if hasDelete {
		opsThen = append(opsThen, clientv3.OpPut(worker.prefix.String()+etcd.DeletionCounterKey,
			fmt.Sprint(worker.deleteCounter+1)))
	}
	if worker.deleteCounter > 0 {
		cmps = append(cmps, clientv3.Compare(clientv3.Value(worker.prefix.String()+
			etcd.DeletionCounterKey),
			"=", fmt.Sprint(worker.deleteCounter)))
	} else if worker.deleteCounter == 0 {
		cmps = append(cmps, clientv3.Compare(clientv3.CreateRevision(worker.prefix.String()+
			etcd.DeletionCounterKey),
			"=", 0))
	} else {
		panic("unreachable")
	}

	worker.metrics.metricEtcdTxnSize.Observe(float64(size))
	startTime := time.Now()
	resp, err := worker.client.Txn(ctx, cmps, opsThen, etcd.TxnEmptyOpsElse)

	// For testing the situation where we have a progress notification that
	// has the same revision as the committed Etcd transaction.
	failpoint.Inject("InjectProgressRequestAfterCommit", func() {
		if err := worker.client.RequestProgress(ctx); err != nil {
			failpoint.Return(errors.Trace(err))
		}
	})

	costTime := time.Since(startTime)
	if costTime > etcdWorkerLogsWarnDuration {
		log.Warn("Etcd transaction took too long", zap.Duration("duration", costTime))
	}
	worker.metrics.metricEtcdTxnDuration.Observe(costTime.Seconds())
	if err != nil {
		return errors.Trace(err)
	}

	logEtcdOps(opsThen, resp.Succeeded)
	if resp.Succeeded {
		worker.barrierRev = resp.Header.GetRevision()
		return nil
	}

	// Logs the conditions for the failed Etcd transaction.
	worker.logEtcdCmps(cmps)
	return cerrors.ErrEtcdTryAgain.GenWithStackByArgs()
}
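
// One scenario the deletion-counter compares above guard against, for
// illustration: suppose this worker snapshots a state in which key k is
// absent, so its txn carries Compare(ModRevision(k), "=", 0). If another
// worker creates and then deletes k in the meantime, ModRevision(k) evaluates
// to 0 again and the compare on k alone would still succeed, missing the
// write-write conflict. Because the other worker's delete also bumps
// DeletionCounterKey, the counter compare fails instead and the txn is
// retried via ErrEtcdTryAgain.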

func (worker *EtcdWorker) applyUpdates() error {
	for _, update := range worker.pendingUpdates {
		err := worker.state.Update(update.key, update.value, false)
		if err != nil {
			return errors.Trace(err)
		}
	}
	worker.state.UpdatePendingChange()

	worker.pendingUpdates = worker.pendingUpdates[:0]
	return nil
}

func logEtcdOps(ops []clientv3.Op, committed bool) {
	if committed && (log.GetLevel() != zapcore.DebugLevel || len(ops) == 0) {
		return
	}
	logFn := log.Debug
	if !committed {
		logFn = log.Info
	}

	logFn("[etcd worker] ==========Update State to ETCD==========")
	for _, op := range ops {
		if op.IsDelete() {
			logFn("[etcd worker] delete key", zap.ByteString("key", op.KeyBytes()))
		} else {
			logFn("[etcd worker] put key", zap.ByteString("key", op.KeyBytes()), zap.ByteString("value", op.ValueBytes()))
		}
	}
	logFn("[etcd worker] ============State Commit=============", zap.Bool("committed", committed))
}

func (worker *EtcdWorker) logEtcdCmps(cmps []clientv3.Cmp) {
	log.Info("[etcd worker] ==========Failed Etcd Txn Cmps==========")
	for _, cmp := range cmps {
		cmp := etcdserverpb.Compare(cmp)
		log.Info("[etcd worker] compare",
			zap.String("cmp", cmp.String()))
	}
	log.Info("[etcd worker] ============End Failed Etcd Txn Cmps=============")
}

func (worker *EtcdWorker) cleanUp() {
	worker.rawState = nil
	worker.revision = 0
	worker.pendingUpdates = worker.pendingUpdates[:0]
}

func (worker *EtcdWorker) isDeleteCounterKey(key []byte) bool {
	return string(key) == worker.prefix.String()+etcd.DeletionCounterKey
}

func (worker *EtcdWorker) handleDeleteCounter(value []byte) {
	if len(value) == 0 {
		// The delete counter key has been deleted, so reset the internal counter.
		worker.deleteCounter = 0
		return
	}

	var err error
	worker.deleteCounter, err = strconv.ParseInt(string(value), 10, 64)
	if err != nil {
		// This should never happen unless the Etcd server has been tampered with.
		log.Panic("strconv failed. Unexpected Etcd state.", zap.Error(err))
	}
	if worker.deleteCounter <= 0 {
		log.Panic("unexpected delete counter", zap.Int64("value", worker.deleteCounter))
	}
}

// checkAndMigrateMetaData checks whether metadata should be migrated;
// if so, it blocks until the migration is done.
func (worker *EtcdWorker) checkAndMigrateMetaData(
	ctx context.Context, role string,
) error {
	shouldMigrate, err := worker.migrator.ShouldMigrate(ctx)
	if err != nil {
		return errors.Trace(err)
	}
	if !shouldMigrate {
		return nil
	}

	if role != pkgutil.RoleController.String() {
		err := worker.migrator.WaitMetaVersionMatched(ctx)
		return errors.Trace(err)
	}

	err = worker.migrator.Migrate(ctx)
	return err
}