github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/processor/manager.go

// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package processor

import (
	"context"
	"fmt"
	"io"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/cdc/model"
	"github.com/pingcap/tiflow/cdc/vars"
	"github.com/pingcap/tiflow/pkg/config"
	cerror "github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/etcd"
	"github.com/pingcap/tiflow/pkg/orchestrator"
	"github.com/pingcap/tiflow/pkg/upstream"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"
)

type commandTp int

const (
	commandTpUnknown commandTp = iota
	commandTpWriteDebugInfo
	processorLogsWarnDuration = 1 * time.Second
)

type command struct {
	tp      commandTp
	payload interface{}
	done    chan<- error
}

// Manager is a manager of processors, which maintains the state and behavior of processors.
type Manager interface {
	orchestrator.Reactor

	// Close closes the manager itself and all processors. It must not be called concurrently with `Tick`.
	// After it is called, no other method should be called any more.
	Close()

	WriteDebugInfo(ctx context.Context, w io.Writer, done chan<- error)
}

// managerImpl is a manager of processors, which maintains the state and behavior of processors.
type managerImpl struct {
	captureInfo     *model.CaptureInfo
	liveness        *model.Liveness
	processors      map[model.ChangeFeedID]*processor
	commandQueue    chan *command
	upstreamManager *upstream.Manager

	newProcessor func(
		*model.ChangeFeedInfo,
		*model.ChangeFeedStatus,
		*model.CaptureInfo,
		model.ChangeFeedID,
		*upstream.Upstream,
		*model.Liveness,
		uint64,
		*config.SchedulerConfig,
		etcd.OwnerCaptureInfoClient,
		*vars.GlobalVars,
	) *processor
	cfg        *config.SchedulerConfig
	globalVars *vars.GlobalVars

	metricProcessorCloseDuration prometheus.Observer
}

// NewManager creates a new processor manager.
func NewManager(
	captureInfo *model.CaptureInfo,
	upstreamManager *upstream.Manager,
	liveness *model.Liveness,
	cfg *config.SchedulerConfig,
	globalVars *vars.GlobalVars,
) Manager {
	return &managerImpl{
		captureInfo:                  captureInfo,
		liveness:                     liveness,
		processors:                   make(map[model.ChangeFeedID]*processor),
		commandQueue:                 make(chan *command, 4),
		upstreamManager:              upstreamManager,
		newProcessor:                 NewProcessor,
		metricProcessorCloseDuration: processorCloseDuration,
		cfg:                          cfg,
		globalVars:                   globalVars,
	}
}
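
// The snippet below is a minimal usage sketch and is not part of this file: it assumes
// the caller (typically the capture layer) already holds the capture info, the upstream
// manager, a liveness flag, the scheduler config, and the global vars. Because Manager
// embeds orchestrator.Reactor, it is meant to be driven by an etcd worker that calls
// Tick with snapshots of the etcd state and calls Close on shutdown.
//
//	mgr := NewManager(captureInfo, upstreamManager, &liveness, schedulerCfg, globalVars)
//	// Register mgr with an etcd worker; the worker repeatedly calls
//	// mgr.Tick(ctx, state) and, when the capture exits, mgr.Close().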

// Tick implements the `orchestrator.Reactor` interface.
// The `state` parameter is passed in by the etcd worker and must be a snapshot of the KVs in etcd.
// Tick creates or removes processor instances according to the given `state`, and passes the
// `state` on to the existing processor instances.
func (m *managerImpl) Tick(stdCtx context.Context,
	state orchestrator.ReactorState,
) (nextState orchestrator.ReactorState, err error) {
	globalState := state.(*orchestrator.GlobalReactorState)
	m.handleCommand()

	var inactiveChangefeedCount int
	for changefeedID, changefeedState := range globalState.Changefeeds {
		if !changefeedState.Active(m.captureInfo.ID) {
			inactiveChangefeedCount++
			m.closeProcessor(changefeedID)
			continue
		}
		currentChangefeedEpoch := changefeedState.Info.Epoch
		p, exist := m.processors[changefeedID]
		if !exist {
			up, ok := m.upstreamManager.Get(changefeedState.Info.UpstreamID)
			if !ok {
				upstreamInfo := globalState.Upstreams[changefeedState.Info.UpstreamID]
				up = m.upstreamManager.AddUpstream(upstreamInfo)
			}
			failpoint.Inject("processorManagerHandleNewChangefeedDelay", nil)

			cfg := *m.cfg
			cfg.ChangefeedSettings = changefeedState.Info.Config.Scheduler
			p = m.newProcessor(
				changefeedState.Info, changefeedState.Status,
				m.captureInfo, changefeedID, up, m.liveness,
				currentChangefeedEpoch, &cfg, m.globalVars.EtcdClient,
				m.globalVars)
			m.processors[changefeedID] = p
		}
		if currentChangefeedEpoch != p.changefeedEpoch {
			// The changefeed has restarted due to an error, so the processor is stale.
			m.closeProcessor(changefeedID)
			continue
		}
		// Check that the changefeed is normal before ticking it.
		if !checkChangefeedNormal(changefeedState) {
			patchProcessorErr(p.captureInfo, changefeedState,
				cerror.ErrAdminStopProcessor.GenWithStackByArgs())
			m.closeProcessor(changefeedID)
			continue
		}
		// Check that the capture is alive.
		changefeedState.CheckCaptureAlive(p.captureInfo.ID)
		// Check whether the task position has been created.
		if createTaskPosition(changefeedState, p.captureInfo) {
			continue
		}
		err, warning := p.Tick(stdCtx, changefeedState.Info, changefeedState.Status)
		if warning != nil {
			patchProcessorWarning(p.captureInfo, changefeedState, warning)
		}
		if err != nil {
			patchProcessorErr(p.captureInfo, changefeedState, err)
			// patchProcessorErr has already patched the error to notify the owner,
			// so the manager can simply close the processor and continue to tick other processors.
			m.closeProcessor(changefeedID)
		}
	}
	// Check whether any processors in memory have leaked.
	if len(globalState.Changefeeds)-inactiveChangefeedCount != len(m.processors) {
		for changefeedID := range m.processors {
			if _, exist := globalState.Changefeeds[changefeedID]; !exist {
				m.closeProcessor(changefeedID)
			}
		}
	}

	if err := m.upstreamManager.Tick(stdCtx, globalState); err != nil {
		return state, errors.Trace(err)
	}
	return state, nil
}

// checkChangefeedNormal checks whether the changefeed is runnable.
func checkChangefeedNormal(changefeed *orchestrator.ChangefeedReactorState) bool {
	// Check the state in this tick and make sure that the admin job type of the changefeed is not a stop state.
	if changefeed.Info.AdminJobType.IsStopState() || changefeed.Status.AdminJobType.IsStopState() {
		return false
	}
	// Add a patch that re-checks that the changefeed is runnable when the patches are applied in the etcd worker.
	changefeed.CheckChangefeedNormal()
	return true
}
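
// The PatchTaskPosition calls below all follow the orchestrator patch contract: the
// closure receives the current value (nil if the key does not exist yet) and returns
// the new value, a bool reporting whether the value changed, and an error. The sketch
// below is for illustration only and does not appear in this file; it shows the shape
// of such a patch, here clearing a previously reported error:
//
//	changefeed.PatchTaskPosition(captureID,
//		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
//			if position == nil {
//				position = &model.TaskPosition{}
//			}
//			position.Error = nil
//			return position, true, nil
//		})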

// createTaskPosition creates a new task position if one does not exist yet.
// The task position is missing only when the processor is running its very first tick.
func createTaskPosition(changefeed *orchestrator.ChangefeedReactorState,
	captureInfo *model.CaptureInfo,
) (skipThisTick bool) {
	if _, exist := changefeed.TaskPositions[captureInfo.ID]; exist {
		return false
	}
	changefeed.PatchTaskPosition(captureInfo.ID,
		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
			if position == nil {
				return &model.TaskPosition{}, true, nil
			}
			return position, false, nil
		})
	return true
}

func patchProcessorErr(captureInfo *model.CaptureInfo,
	changefeed *orchestrator.ChangefeedReactorState,
	err error,
) {
	if isProcessorIgnorableError(err) {
		log.Info("processor exited",
			zap.String("capture", captureInfo.ID),
			zap.String("namespace", changefeed.ID.Namespace),
			zap.String("changefeed", changefeed.ID.ID),
			zap.Error(err))
		return
	}
	// Record the error information in etcd.
	var code string
	if rfcCode, ok := cerror.RFCCode(err); ok {
		code = string(rfcCode)
	} else {
		code = string(cerror.ErrProcessorUnknown.RFCCode())
	}
	changefeed.PatchTaskPosition(captureInfo.ID,
		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
			if position == nil {
				position = &model.TaskPosition{}
			}
			position.Error = &model.RunningError{
				Time:    time.Now(),
				Addr:    captureInfo.AdvertiseAddr,
				Code:    code,
				Message: err.Error(),
			}
			return position, true, nil
		})
	log.Error("run processor failed",
		zap.String("capture", captureInfo.ID),
		zap.String("namespace", changefeed.ID.Namespace),
		zap.String("changefeed", changefeed.ID.ID),
		zap.Error(err))
}

func patchProcessorWarning(captureInfo *model.CaptureInfo,
	changefeed *orchestrator.ChangefeedReactorState, err error,
) {
	if err == nil {
		return
	}
	var code string
	if rfcCode, ok := cerror.RFCCode(err); ok {
		code = string(rfcCode)
	} else {
		code = string(cerror.ErrProcessorUnknown.RFCCode())
	}
	changefeed.PatchTaskPosition(captureInfo.ID,
		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
			if position == nil {
				position = &model.TaskPosition{}
			}
			position.Warning = &model.RunningError{
				Time:    time.Now(),
				Addr:    captureInfo.AdvertiseAddr,
				Code:    code,
				Message: err.Error(),
			}
			return position, true, nil
		})
}

func (m *managerImpl) closeProcessor(changefeedID model.ChangeFeedID) {
	processor, exist := m.processors[changefeedID]
	if exist {
		startTime := time.Now()
		err := processor.Close()
		costTime := time.Since(startTime)
		if costTime > processorLogsWarnDuration {
			log.Warn("processor close took too long",
				zap.String("namespace", changefeedID.Namespace),
				zap.String("changefeed", changefeedID.ID),
				zap.String("capture", m.captureInfo.ID),
				zap.Duration("duration", costTime))
		}
		m.metricProcessorCloseDuration.Observe(costTime.Seconds())
		if err != nil {
			log.Warn("failed to close processor",
				zap.String("namespace", changefeedID.Namespace),
				zap.String("changefeed", changefeedID.ID),
				zap.Error(err))
		}
		delete(m.processors, changefeedID)
	}
}

// Close closes the manager itself and all processors.
// Note: it must not be called concurrently with `Tick`.
func (m *managerImpl) Close() {
	log.Info("processor.Manager is closing")
	for changefeedID := range m.processors {
		m.closeProcessor(changefeedID)
	}
	// FIXME: we should drain the command queue and signal an error to callers.
}

// WriteDebugInfo writes the debug info to the given Writer.
func (m *managerImpl) WriteDebugInfo(
	ctx context.Context, w io.Writer, done chan<- error,
) {
	err := m.sendCommand(ctx, commandTpWriteDebugInfo, w, done)
	if err != nil {
		log.Warn("send command commandTpWriteDebugInfo failed", zap.Error(err))
	}
}

// sendCommand sends a command to the manager.
// `done` is closed when the command has completed, or when sendCommand returns an error.
func (m *managerImpl) sendCommand(
	ctx context.Context, tp commandTp, payload interface{}, done chan<- error,
) error {
	cmd := &command{tp: tp, payload: payload, done: done}
	select {
	case <-ctx.Done():
		close(done)
		return errors.Trace(ctx.Err())
	case m.commandQueue <- cmd:
		// FIXME: signal the EtcdWorker to handle commands ASAP.
	}
	return nil
}

func (m *managerImpl) handleCommand() {
	var cmd *command
	select {
	case cmd = <-m.commandQueue:
	default:
		return
	}
	defer close(cmd.done)
	switch cmd.tp {
	case commandTpWriteDebugInfo:
		w := cmd.payload.(io.Writer)
		err := m.writeDebugInfo(w)
		if err != nil {
			cmd.done <- err
		}
	default:
		log.Warn("Unknown command in processor manager", zap.Any("command", cmd))
	}
}

func (m *managerImpl) writeDebugInfo(w io.Writer) error {
	for changefeedID, processor := range m.processors {
		fmt.Fprintf(w, "changefeedID: %s\n", changefeedID)
		err := processor.WriteDebugInfo(w)
		if err != nil {
			return errors.Trace(err)
		}
		fmt.Fprintf(w, "\n")
	}

	return nil
}
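
// The snippet below is a hypothetical caller of WriteDebugInfo, for illustration only
// (the real caller is outside this file). The `done` channel is buffered so that
// handleCommand, which runs inside Tick, never blocks when it reports an error; `done`
// is closed once the command has been handled, so receiving from it yields either the
// error or nil.
//
//	done := make(chan error, 1)
//	var buf bytes.Buffer
//	mgr.WriteDebugInfo(ctx, &buf, done)
//	if err := <-done; err != nil {
//		log.Warn("write debug info failed", zap.Error(err))
//	}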