// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package syncer

import (
	"strings"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	tcontext "github.com/pingcap/tiflow/dm/pkg/context"
	"github.com/pingcap/tiflow/dm/pkg/log"
	"github.com/pingcap/tiflow/dm/pkg/terror"
	"github.com/pingcap/tiflow/dm/pkg/utils"
	"github.com/pingcap/tiflow/dm/syncer/dbconn"
	"github.com/pingcap/tiflow/dm/syncer/metrics"
	"github.com/pingcap/tiflow/pkg/sqlmodel"
	"go.uber.org/zap"
)

// DMLWorker is used to sync dml.
//
// It reads jobs from inCh, distributes them over workerCount per-queue
// channels (one executing goroutine each) and forwards flush/asyncFlush jobs
// to flushCh.
type DMLWorker struct {
	// compact mirrors cfg.Compact; key-not-found detection is skipped when set.
	compact bool
	// batch is the number of jobs at which a queue stops accumulating and
	// executes the pending batch.
	batch int
	// workerCount is the number of DML queues started by run.
	workerCount int
	// chanSize is the buffer size of every per-queue job channel.
	chanSize int
	// multipleRows mirrors cfg.MultipleRows; when set, SQLs are generated by
	// genDMLsWithSameOp and key-not-found detection is skipped.
	multipleRows bool
	// toDBConns holds the downstream connections, indexed by queue ID.
	toDBConns []*dbconn.DBConn
	// syncCtx is the parent of the timeout context used for every SQL execution.
	syncCtx *tcontext.Context
	logger  log.Logger
	// metricProxies provides the queue-size gauge and add-job duration
	// histogram updated by the worker; it is also passed to ExecuteSQL.
	metricProxies *metrics.Proxies

	// for MetricsProxies
	task   string
	source string
	worker string

	// callback func
	// TODO: refine callback func
	// successFunc is called after a batch was executed without error.
	successFunc func(int, int, []*job)
	// fatalFunc is called with the job to blame when a batch failed.
	fatalFunc func(*job, error)
	// lagFunc is called with the first job of a batch, and with nil once the
	// queue has no more buffered jobs.
	lagFunc func(*job, int)
	// updateJobMetricsFunc is called with false when a job is about to be
	// queued, and with true once a flush/conflict job has been handled by
	// every queue.
	updateJobMetricsFunc func(bool, string, *job)

	// channel
	// inCh is the input of the worker; run returns when it is closed.
	inCh chan *job
	// flushCh receives flush and asyncFlush jobs after they were dispatched.
	flushCh chan *job
}

// dmlWorkerWrap creates and runs a dmlWorker instance and returns flush job channel.
62 func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { 63 chanSize := syncer.cfg.QueueSize / 2 64 if syncer.cfg.Compact { 65 chanSize /= 2 66 } 67 dmlWorker := &DMLWorker{ 68 compact: syncer.cfg.Compact, 69 batch: syncer.cfg.Batch, 70 workerCount: syncer.cfg.WorkerCount, 71 chanSize: chanSize, 72 multipleRows: syncer.cfg.MultipleRows, 73 task: syncer.cfg.Name, 74 source: syncer.cfg.SourceID, 75 worker: syncer.cfg.WorkerName, 76 logger: syncer.tctx.Logger.WithFields(zap.String("component", "dml_worker")), 77 successFunc: syncer.successFunc, 78 fatalFunc: syncer.fatalFunc, 79 lagFunc: syncer.updateReplicationJobTS, 80 updateJobMetricsFunc: syncer.updateJobMetrics, 81 syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers 82 metricProxies: syncer.metricsProxies, 83 toDBConns: syncer.toDBConns, 84 inCh: inCh, 85 flushCh: make(chan *job), 86 } 87 88 go func() { 89 dmlWorker.run() 90 dmlWorker.close() 91 }() 92 return dmlWorker.flushCh 93 } 94 95 // close closes outer channel. 96 func (w *DMLWorker) close() { 97 close(w.flushCh) 98 } 99 100 // run distribute jobs by queueBucket. 
101 func (w *DMLWorker) run() { 102 jobChs := make([]chan *job, w.workerCount) 103 104 for i := 0; i < w.workerCount; i++ { 105 jobChs[i] = make(chan *job, w.chanSize) 106 go w.executeJobs(i, jobChs[i]) 107 } 108 109 defer func() { 110 for i := 0; i < w.workerCount; i++ { 111 close(jobChs[i]) 112 } 113 }() 114 115 queueBucketMapping := make([]string, w.workerCount) 116 for i := 0; i < w.workerCount; i++ { 117 queueBucketMapping[i] = queueBucketName(i) 118 } 119 for j := range w.inCh { 120 w.metricProxies.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh))) 121 switch j.tp { 122 case flush: 123 w.updateJobMetricsFunc(false, adminQueueName, j) 124 w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) 125 j.flushWg.Wait() 126 w.updateJobMetricsFunc(true, adminQueueName, j) 127 w.flushCh <- j 128 case asyncFlush: 129 w.updateJobMetricsFunc(false, adminQueueName, j) 130 w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) 131 w.flushCh <- j 132 case conflict: 133 w.updateJobMetricsFunc(false, adminQueueName, j) 134 w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) 135 j.flushWg.Wait() 136 w.updateJobMetricsFunc(true, adminQueueName, j) 137 default: 138 queueBucket := int(utils.GenHashKey(j.dmlQueueKey)) % w.workerCount 139 w.updateJobMetricsFunc(false, queueBucketMapping[queueBucket], j) 140 startTime := time.Now() 141 w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dmlQueueKey)) 142 jobChs[queueBucket] <- j 143 w.metricProxies.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds()) 144 } 145 } 146 } 147 148 func (w *DMLWorker) sendJobToAllDmlQueue(j *job, jobChs []chan *job, queueBucketMapping []string) { 149 // flush for every DML queue 150 for i, jobCh := range jobChs { 151 startTime := time.Now() 152 jobCh <- j 153 w.metricProxies.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, 
queueBucketMapping[i], w.source).Observe(time.Since(startTime).Seconds()) 154 } 155 } 156 157 // executeJobs execute jobs in same queueBucket 158 // All the jobs received should be executed consecutively. 159 func (w *DMLWorker) executeJobs(queueID int, jobCh chan *job) { 160 jobs := make([]*job, 0, w.batch) 161 workerJobIdx := dmlWorkerJobIdx(queueID) 162 queueBucket := queueBucketName(queueID) 163 for j := range jobCh { 164 w.metricProxies.QueueSizeGauge.WithLabelValues(w.task, queueBucket, w.source).Set(float64(len(jobCh))) 165 166 if j.tp != flush && j.tp != asyncFlush && j.tp != conflict { 167 if len(jobs) == 0 { 168 // set job TS when received first job of this batch. 169 w.lagFunc(j, workerJobIdx) 170 } 171 jobs = append(jobs, j) 172 if len(jobs) < w.batch && len(jobCh) > 0 { 173 continue 174 } 175 } 176 177 failpoint.Inject("syncDMLBatchNotFull", func() { 178 if len(jobCh) == 0 && len(jobs) < w.batch { 179 w.logger.Info("execute not full job queue") 180 } 181 }) 182 183 w.executeBatchJobs(queueID, jobs) 184 if j.tp == conflict || j.tp == flush || j.tp == asyncFlush { 185 j.flushWg.Done() 186 } 187 188 jobs = jobs[0:0] 189 if len(jobCh) == 0 { 190 failpoint.Inject("noJobInQueueLog", func() { 191 w.logger.Debug("no job in queue, update lag to zero", zap.Int( 192 "workerJobIdx", workerJobIdx), zap.Int64("current ts", time.Now().Unix())) 193 }) 194 w.lagFunc(nil, workerJobIdx) 195 } 196 } 197 } 198 199 // executeBatchJobs execute jobs with batch size. 
200 func (w *DMLWorker) executeBatchJobs(queueID int, jobs []*job) { 201 var ( 202 affect int 203 queries []string 204 args [][]interface{} 205 db = w.toDBConns[queueID] 206 err error 207 dmls = make([]*sqlmodel.RowChange, 0, len(jobs)) 208 ) 209 210 defer func() { 211 if err == nil { 212 w.successFunc(queueID, len(dmls), jobs) 213 } else { 214 if len(queries) == len(jobs) { 215 w.fatalFunc(jobs[affect], err) 216 } else { 217 w.logger.Warn("length of queries not equals length of jobs, cannot determine which job failed", zap.Int("queries", len(queries)), zap.Int("jobs", len(jobs))) 218 newJob := job{ 219 startLocation: jobs[0].startLocation, 220 currentLocation: jobs[len(jobs)-1].currentLocation, 221 } 222 w.fatalFunc(&newJob, err) 223 } 224 } 225 }() 226 227 if len(jobs) == 0 { 228 return 229 } 230 failpoint.Inject("failSecondJob", func() { 231 if failExecuteSQLForTest && failOnceForTest.CAS(false, true) { 232 w.logger.Info("trigger failSecondJob") 233 err = terror.ErrDBExecuteFailed.Delegate(errors.New("failSecondJob"), "mock") 234 failpoint.Return() 235 } 236 }) 237 238 queries, args = w.genSQLs(jobs) 239 failpoint.Inject("BlockExecuteSQLs", func(v failpoint.Value) { 240 t := v.(int) // sleep time 241 w.logger.Info("BlockExecuteSQLs", zap.Any("job", jobs[0]), zap.Int("sleep time", t)) 242 for _, query := range queries { 243 if strings.Contains(query, "UPDATE") && strings.Contains(query, "MetricsProxies") { 244 t = 10 245 w.logger.Info("BlockExecuteSQLs block for update sleep 10s for MetricsProxies it test", zap.Any("query", query)) 246 } 247 } 248 time.Sleep(time.Second * time.Duration(t)) 249 }) 250 failpoint.Inject("WaitUserCancel", func(v failpoint.Value) { 251 t := v.(int) 252 time.Sleep(time.Duration(t) * time.Second) 253 }) 254 // use background context to execute sqls as much as possible 255 // set timeout to maxDMLConnectionDuration to make sure dmls can be replicated to downstream event if the latency is high 256 // if users need to quit this asap, we 
can support pause-task/stop-task --force in the future 257 ctx, cancel := w.syncCtx.WithTimeout(maxDMLConnectionDuration) 258 defer cancel() 259 affect, err = db.ExecuteSQL(ctx, w.metricProxies, queries, args...) 260 failpoint.Inject("SafeModeExit", func(val failpoint.Value) { 261 if intVal, ok := val.(int); ok && intVal == 4 && len(jobs) > 0 { 262 w.logger.Warn("fail to exec DML", zap.String("failpoint", "SafeModeExit")) 263 affect, err = 0, terror.ErrDBExecuteFailed.Delegate(errors.New("SafeModeExit"), "mock") 264 } 265 }) 266 267 failpoint.Inject("ErrorOnLastDML", func(_ failpoint.Value) { 268 if len(queries) > len(jobs) { 269 w.logger.Error("error on last queries", zap.Int("queries", len(queries)), zap.Int("jobs", len(jobs))) 270 affect, err = len(queries)-1, terror.ErrDBExecuteFailed.Delegate(errors.New("ErrorOnLastDML"), "mock") 271 } 272 }) 273 274 if w.judgeKeyNotFound(affect, jobs) { 275 // throw an error if needed in the future. 276 // err = terror.ErrDBExecuteFailed.Delegate(errors.New("key not found"), "mock") 277 w.logger.Warn("no matching record is found to update/delete, ER_KEY_NOT_FOUND", zap.Int("affect", affect), zap.Int("jobs", len(jobs)), zap.Stringer("start from", jobs[0].startLocation), zap.Stringer("end at", jobs[len(jobs)-1].currentLocation)) 278 } 279 } 280 281 // genSQLs generate SQLs in single row mode or multiple rows mode. 
282 func (w *DMLWorker) genSQLs(jobs []*job) ([]string, [][]interface{}) { 283 if w.multipleRows { 284 return genDMLsWithSameOp(jobs) 285 } 286 287 queries := make([]string, 0, len(jobs)) 288 args := make([][]interface{}, 0, len(jobs)) 289 for _, j := range jobs { 290 var query string 291 var arg []interface{} 292 appendQueryAndArg := func() { 293 queries = append(queries, query) 294 args = append(args, arg) 295 } 296 297 switch j.dml.Type() { 298 case sqlmodel.RowChangeInsert: 299 if j.safeMode { 300 query, arg = j.dml.GenSQL(sqlmodel.DMLReplace) 301 } else { 302 query, arg = j.dml.GenSQL(sqlmodel.DMLInsert) 303 } 304 305 case sqlmodel.RowChangeUpdate: 306 if j.safeMode { 307 query, arg = j.dml.GenSQL(sqlmodel.DMLDelete) 308 appendQueryAndArg() 309 query, arg = j.dml.GenSQL(sqlmodel.DMLReplace) 310 } else { 311 query, arg = j.dml.GenSQL(sqlmodel.DMLUpdate) 312 } 313 314 case sqlmodel.RowChangeDelete: 315 query, arg = j.dml.GenSQL(sqlmodel.DMLDelete) 316 } 317 318 appendQueryAndArg() 319 } 320 return queries, args 321 } 322 323 func (w *DMLWorker) judgeKeyNotFound(affect int, jobs []*job) bool { 324 // TODO: support compact and multiple rows 325 // In compact mode, we need to calculate the expected affected rows based on the compacted job 326 // while in multiple-rows, we need to calculate the affected rows based on the sql type 327 if w.compact || w.multipleRows { 328 return false 329 } 330 for _, j := range jobs { 331 if j.safeMode { 332 return false 333 } 334 } 335 return affect < len(jobs) 336 }