github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/syncer/dml.go (about) 1 // Copyright 2019 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package syncer 15 16 import ( 17 "encoding/binary" 18 19 "github.com/pingcap/tidb/pkg/expression" 20 "github.com/pingcap/tidb/pkg/parser/charset" 21 "github.com/pingcap/tidb/pkg/parser/model" 22 "github.com/pingcap/tidb/pkg/parser/mysql" 23 "github.com/pingcap/tidb/pkg/parser/types" 24 "github.com/pingcap/tidb/pkg/util/filter" 25 cdcmodel "github.com/pingcap/tiflow/cdc/model" 26 tcontext "github.com/pingcap/tiflow/dm/pkg/context" 27 "github.com/pingcap/tiflow/dm/pkg/log" 28 "github.com/pingcap/tiflow/dm/pkg/terror" 29 "github.com/pingcap/tiflow/dm/pkg/utils" 30 "github.com/pingcap/tiflow/pkg/sqlmodel" 31 "github.com/shopspring/decimal" 32 "go.uber.org/zap" 33 "golang.org/x/text/encoding/charmap" 34 ) 35 36 // genDMLParam stores original data and table structure. 37 type genDMLParam struct { 38 sourceTable *filter.Table // origin table 39 targetTable *filter.Table 40 safeMode bool // only used in update 41 originalData [][]interface{} // all data 42 sourceTableInfo *model.TableInfo // all table info 43 extendData [][]interface{} // all data include extend data 44 } 45 46 // latin1Decider is not usually ISO8859_1 in MySQL. 47 // ref https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html 48 var latin1Decoder = charmap.Windows1252.NewDecoder() 49 50 // adjustValueFromBinlogData adjust the values obtained from go-mysql so that 51 // - the values can be correctly converted to TiDB datum 52 // - the values are in the correct type that go-sql-driver/mysql uses. 53 func adjustValueFromBinlogData( 54 data []interface{}, 55 sourceTI *model.TableInfo, 56 ) ([]interface{}, error) { 57 value := make([]interface{}, 0, len(data)) 58 var err error 59 60 columns := make([]*model.ColumnInfo, 0, len(sourceTI.Columns)) 61 for _, col := range sourceTI.Columns { 62 if !col.Hidden { 63 columns = append(columns, col) 64 } 65 } 66 if len(data) != len(columns) { 67 return nil, terror.ErrSyncerUnitDMLColumnNotMatch.Generate(len(columns), len(data)) 68 } 69 70 for i, d := range data { 71 d = castUnsigned(d, &columns[i].FieldType) 72 isLatin1 := columns[i].GetCharset() == charset.CharsetLatin1 || columns[i].GetCharset() == "" && sourceTI.Charset == charset.CharsetLatin1 73 74 switch v := d.(type) { 75 case int8: 76 d = int64(v) 77 case int16: 78 d = int64(v) 79 case int32: 80 d = int64(v) 81 case uint8: 82 d = uint64(v) 83 case uint16: 84 d = uint64(v) 85 case uint32: 86 d = uint64(v) 87 case uint: 88 d = uint64(v) 89 case decimal.Decimal: 90 d = v.String() 91 case []byte: 92 if isLatin1 { 93 d, err = latin1Decoder.Bytes(v) 94 // replicate wrong data and don't break task 95 if err != nil { 96 log.L().DPanic("can't convert latin1 to utf8", zap.ByteString("value", v), zap.Error(err)) 97 } 98 } 99 case string: 100 isBinary := columns[i].GetType() == mysql.TypeString && mysql.HasBinaryFlag(columns[i].GetFlag()) 101 isGBK := columns[i].GetCharset() == charset.CharsetGBK || columns[i].GetCharset() == "" && sourceTI.Charset == charset.CharsetGBK 102 switch { 103 case isBinary: 104 // convert string to []byte so that go-sql-driver/mysql can use _binary'value' for DML 105 d = []byte(v) 106 // if column is binary and value length is less than column length, we need to pad the value with 0x00 107 // ref: https://dev.mysql.com/doc/refman/8.0/en/binary-varbinary.html 108 valLen := columns[i].FieldType.GetFlen() 109 if valLen != types.UnspecifiedLength && valLen > len(v) { 110 padding := make([]byte, valLen-len(v)) 111 d = append(d.([]byte), padding...) 112 } 113 case isGBK: 114 // convert string to []byte so that go-sql-driver/mysql can use _binary'value' for DML 115 d = []byte(v) 116 case isLatin1: 117 // TiDB has bug in latin1 so we must convert it to utf8 at DM's scope 118 // https://github.com/pingcap/tidb/issues/18955 119 d, err = latin1Decoder.String(v) 120 // replicate wrong data and don't break task 121 if err != nil { 122 log.L().DPanic("can't convert latin1 to utf8", zap.String("value", v), zap.Error(err)) 123 } 124 } 125 } 126 value = append(value, d) 127 } 128 return value, nil 129 } 130 131 // nolint:dupl 132 func (s *Syncer) genAndFilterInsertDMLs(tctx *tcontext.Context, param *genDMLParam, filterExprs []expression.Expression) ([]*sqlmodel.RowChange, error) { 133 var ( 134 tableID = utils.GenTableID(param.targetTable) 135 originalDataSeq = param.originalData 136 ti = param.sourceTableInfo 137 extendData = param.extendData 138 dmls = make([]*sqlmodel.RowChange, 0, len(originalDataSeq)) 139 ) 140 141 // if downstream pk/uk(not null) exits, then use downstream pk/uk(not null) 142 downstreamTableInfo, err := s.schemaTracker.GetDownStreamTableInfo(tctx, tableID, ti) 143 if err != nil { 144 return nil, err 145 } 146 147 if extendData != nil { 148 originalDataSeq = extendData 149 } 150 151 RowLoop: 152 for _, data := range originalDataSeq { 153 originalValue, err := adjustValueFromBinlogData(data, ti) 154 if err != nil { 155 return nil, err 156 } 157 158 for _, expr := range filterExprs { 159 skip, err := SkipDMLByExpression(s.sessCtx, originalValue, expr, ti.Columns) 160 if err != nil { 161 return nil, err 162 } 163 if skip { 164 s.filteredInsert.Add(1) 165 continue RowLoop 166 } 167 } 168 169 rowChange := sqlmodel.NewRowChange( 170 &cdcmodel.TableName{Schema: param.sourceTable.Schema, Table: param.sourceTable.Name}, 171 &cdcmodel.TableName{Schema: param.targetTable.Schema, Table: param.targetTable.Name}, 172 nil, 173 originalValue, 174 param.sourceTableInfo, 175 downstreamTableInfo.TableInfo, 176 s.sessCtx, 177 ) 178 rowChange.SetWhereHandle(downstreamTableInfo.WhereHandle) 179 dmls = append(dmls, rowChange) 180 } 181 182 return dmls, nil 183 } 184 185 // nolint:dupl 186 func (s *Syncer) genAndFilterUpdateDMLs( 187 tctx *tcontext.Context, 188 param *genDMLParam, 189 oldValueFilters []expression.Expression, 190 newValueFilters []expression.Expression, 191 ) ([]*sqlmodel.RowChange, error) { 192 var ( 193 tableID = utils.GenTableID(param.targetTable) 194 originalData = param.originalData 195 ti = param.sourceTableInfo 196 extendData = param.extendData 197 dmls = make([]*sqlmodel.RowChange, 0, len(originalData)/2) 198 ) 199 200 // if downstream pk/uk(not null) exits, then use downstream pk/uk(not null) 201 downstreamTableInfo, err := s.schemaTracker.GetDownStreamTableInfo(tctx, tableID, ti) 202 if err != nil { 203 return nil, err 204 } 205 206 if extendData != nil { 207 originalData = extendData 208 } 209 210 RowLoop: 211 for i := 0; i < len(originalData); i += 2 { 212 oriOldData := originalData[i] 213 oriChangedData := originalData[i+1] 214 215 if len(oriOldData) != len(oriChangedData) { 216 return nil, terror.ErrSyncerUnitDMLOldNewValueMismatch.Generate(len(oriOldData), len(oriChangedData)) 217 } 218 219 oriOldValues, err := adjustValueFromBinlogData(oriOldData, ti) 220 if err != nil { 221 return nil, err 222 } 223 oriChangedValues, err := adjustValueFromBinlogData(oriChangedData, ti) 224 if err != nil { 225 return nil, err 226 } 227 228 for j := range oldValueFilters { 229 // AND logic 230 oldExpr, newExpr := oldValueFilters[j], newValueFilters[j] 231 skip1, err := SkipDMLByExpression(s.sessCtx, oriOldValues, oldExpr, ti.Columns) 232 if err != nil { 233 return nil, err 234 } 235 skip2, err := SkipDMLByExpression(s.sessCtx, oriChangedValues, newExpr, ti.Columns) 236 if err != nil { 237 return nil, err 238 } 239 if skip1 && skip2 { 240 s.filteredUpdate.Add(1) 241 // TODO: we skip generating the UPDATE SQL, so we left the old value here. Is this expected? 242 continue RowLoop 243 } 244 } 245 246 rowChange := sqlmodel.NewRowChange( 247 &cdcmodel.TableName{Schema: param.sourceTable.Schema, Table: param.sourceTable.Name}, 248 &cdcmodel.TableName{Schema: param.targetTable.Schema, Table: param.targetTable.Name}, 249 oriOldValues, 250 oriChangedValues, 251 param.sourceTableInfo, 252 downstreamTableInfo.TableInfo, 253 s.sessCtx, 254 ) 255 rowChange.SetWhereHandle(downstreamTableInfo.WhereHandle) 256 dmls = append(dmls, rowChange) 257 } 258 259 return dmls, nil 260 } 261 262 // nolint:dupl 263 func (s *Syncer) genAndFilterDeleteDMLs(tctx *tcontext.Context, param *genDMLParam, filterExprs []expression.Expression) ([]*sqlmodel.RowChange, error) { 264 var ( 265 tableID = utils.GenTableID(param.targetTable) 266 dataSeq = param.originalData 267 ti = param.sourceTableInfo 268 extendData = param.extendData 269 dmls = make([]*sqlmodel.RowChange, 0, len(dataSeq)) 270 ) 271 272 // if downstream pk/uk(not null) exits, then use downstream pk/uk(not null) 273 downstreamTableInfo, err := s.schemaTracker.GetDownStreamTableInfo(tctx, tableID, ti) 274 if err != nil { 275 return nil, err 276 } 277 278 if extendData != nil { 279 dataSeq = extendData 280 } 281 282 RowLoop: 283 for _, data := range dataSeq { 284 value, err := adjustValueFromBinlogData(data, ti) 285 if err != nil { 286 return nil, err 287 } 288 289 for _, expr := range filterExprs { 290 skip, err := SkipDMLByExpression(s.sessCtx, value, expr, ti.Columns) 291 if err != nil { 292 return nil, err 293 } 294 if skip { 295 s.filteredDelete.Add(1) 296 continue RowLoop 297 } 298 } 299 300 rowChange := sqlmodel.NewRowChange( 301 &cdcmodel.TableName{Schema: param.sourceTable.Schema, Table: param.sourceTable.Name}, 302 &cdcmodel.TableName{Schema: param.targetTable.Schema, Table: param.targetTable.Name}, 303 value, 304 nil, 305 param.sourceTableInfo, 306 downstreamTableInfo.TableInfo, 307 s.sessCtx, 308 ) 309 rowChange.SetWhereHandle(downstreamTableInfo.WhereHandle) 310 dmls = append(dmls, rowChange) 311 } 312 313 return dmls, nil 314 } 315 316 func castUnsigned(data interface{}, ft *types.FieldType) interface{} { 317 if !mysql.HasUnsignedFlag(ft.GetFlag()) { 318 return data 319 } 320 321 switch v := data.(type) { 322 case int: 323 return uint(v) 324 case int8: 325 return uint8(v) 326 case int16: 327 return uint16(v) 328 case int32: 329 if ft.GetType() == mysql.TypeInt24 { 330 // we use int32 to store MEDIUMINT, if the value is signed, it's fine 331 // but if the value is un-signed, simply convert it use `uint32` may out of the range 332 // like -4692783 converted to 4290274513 (2^32 - 4692783), but we expect 12084433 (2^24 - 4692783) 333 data := make([]byte, 4) 334 binary.LittleEndian.PutUint32(data, uint32(v)) 335 return uint32(data[0]) | uint32(data[1])<<8 | uint32(data[2])<<16 336 } 337 return uint32(v) 338 case int64: 339 return uint64(v) 340 } 341 342 return data 343 } 344 345 // checkLogColumns returns error when not all rows in skipped is empty, which means the binlog doesn't contain all 346 // columns. 347 // TODO: don't return error when all skipped columns is non-PK. 348 func checkLogColumns(skipped [][]int) error { 349 for _, row := range skipped { 350 if len(row) > 0 { 351 return terror.ErrBinlogNotLogColumn 352 } 353 } 354 return nil 355 } 356 357 // genSQLMultipleRows generates multiple rows SQL with different dmlOpType. 358 func genSQLMultipleRows(op sqlmodel.DMLType, dmls []*sqlmodel.RowChange) (queries string, args []interface{}) { 359 if len(dmls) > 1 { 360 log.L().Debug("generate DMLs with multiple rows", zap.Stringer("op", op), zap.Stringer("original op", dmls[0].Type()), zap.Int("rows", len(dmls))) 361 } 362 switch op { 363 case sqlmodel.DMLInsert, sqlmodel.DMLReplace, sqlmodel.DMLInsertOnDuplicateUpdate: 364 return sqlmodel.GenInsertSQL(op, dmls...) 365 case sqlmodel.DMLUpdate: 366 return sqlmodel.GenUpdateSQL(dmls...) 367 case sqlmodel.DMLDelete: 368 return sqlmodel.GenDeleteSQL(dmls...) 369 } 370 return 371 } 372 373 // genDMLsWithSameCols group and gen dmls by same columns. 374 // in optimistic shard mode, different upstream tables may have different columns. 375 // e.g. 376 // insert into tb(a,b,c) values(1,1,1) 377 // insert into tb(a,b,d) values(2,2,2) 378 // we can only combine DMLs with same column names. 379 // all dmls should have same dmlOpType and same tableName. 380 func genDMLsWithSameCols(op sqlmodel.DMLType, dmls []*sqlmodel.RowChange) ([]string, [][]interface{}) { 381 queries := make([]string, 0, len(dmls)) 382 args := make([][]interface{}, 0, len(dmls)) 383 var lastDML *sqlmodel.RowChange 384 var query string 385 var arg []interface{} 386 groupDMLs := make([]*sqlmodel.RowChange, 0, len(dmls)) 387 388 // group dmls by same columns 389 for i, dml := range dmls { 390 if i == 0 { 391 lastDML = dml 392 } 393 if !sqlmodel.SameTypeTargetAndColumns(lastDML, dml) { 394 query, arg = genSQLMultipleRows(op, groupDMLs) 395 queries = append(queries, query) 396 args = append(args, arg) 397 398 groupDMLs = groupDMLs[0:0] 399 lastDML = dml 400 } 401 groupDMLs = append(groupDMLs, dml) 402 } 403 if len(groupDMLs) > 0 { 404 query, arg = genSQLMultipleRows(op, groupDMLs) 405 queries = append(queries, query) 406 args = append(args, arg) 407 } 408 return queries, args 409 } 410 411 // genDMLsWithSameTable groups and generates dmls with same table. 412 // all the dmls should have same dmlOpType. 413 func genDMLsWithSameTable(op sqlmodel.DMLType, jobs []*job) ([]string, [][]interface{}) { 414 queries := make([]string, 0, len(jobs)) 415 args := make([][]interface{}, 0, len(jobs)) 416 var lastTable string 417 groupDMLs := make([]*sqlmodel.RowChange, 0, len(jobs)) 418 419 if op == sqlmodel.DMLUpdate { 420 for i, j := range jobs { 421 if j.safeMode { 422 query, arg := j.dml.GenSQL(sqlmodel.DMLDelete) 423 queries = append(queries, query) 424 args = append(args, arg) 425 query, arg = j.dml.GenSQL(sqlmodel.DMLReplace) 426 queries = append(queries, query) 427 args = append(args, arg) 428 continue 429 } 430 431 if i == 0 { 432 lastTable = j.dml.TargetTableID() 433 } 434 if lastTable != j.dml.TargetTableID() { 435 query, arg := genDMLsWithSameCols(op, groupDMLs) 436 queries = append(queries, query...) 437 args = append(args, arg...) 438 439 groupDMLs = groupDMLs[0:0] 440 lastTable = j.dml.TargetTableID() 441 } 442 groupDMLs = append(groupDMLs, j.dml) 443 } 444 if len(groupDMLs) > 0 { 445 query, arg := genDMLsWithSameCols(op, groupDMLs) 446 queries = append(queries, query...) 447 args = append(args, arg...) 448 } 449 return queries, args 450 } 451 452 // group dmls with same table 453 for i, j := range jobs { 454 if i == 0 { 455 lastTable = j.dml.TargetTableID() 456 } 457 if lastTable != j.dml.TargetTableID() { 458 query, arg := genDMLsWithSameCols(op, groupDMLs) 459 queries = append(queries, query...) 460 args = append(args, arg...) 461 462 groupDMLs = groupDMLs[0:0] 463 lastTable = j.dml.TargetTableID() 464 } 465 groupDMLs = append(groupDMLs, j.dml) 466 } 467 if len(groupDMLs) > 0 { 468 query, arg := genDMLsWithSameCols(op, groupDMLs) 469 queries = append(queries, query...) 470 args = append(args, arg...) 471 } 472 return queries, args 473 } 474 475 // genDMLsWithSameOp groups and generates dmls by dmlOpType. 476 // TODO: implement a volcano iterator interface for genDMLsWithSameXXX. 477 func genDMLsWithSameOp(jobs []*job) ([]string, [][]interface{}) { 478 queries := make([]string, 0, len(jobs)) 479 args := make([][]interface{}, 0, len(jobs)) 480 var lastOp sqlmodel.DMLType 481 jobsWithSameOp := make([]*job, 0, len(jobs)) 482 483 // group dmls with same dmlOp 484 for i, j := range jobs { 485 var curOp sqlmodel.DMLType 486 switch j.dml.Type() { 487 case sqlmodel.RowChangeUpdate: 488 // if update statement didn't update identify values and not in safemode, regard it as insert on duplicate. 489 if !j.dml.IsIdentityUpdated() && !j.safeMode { 490 curOp = sqlmodel.DMLInsertOnDuplicateUpdate 491 break 492 } 493 494 curOp = sqlmodel.DMLUpdate 495 case sqlmodel.RowChangeInsert: 496 // if insert with safemode, regard it as replace 497 if j.safeMode { 498 curOp = sqlmodel.DMLReplace 499 break 500 } 501 502 curOp = sqlmodel.DMLInsert 503 case sqlmodel.RowChangeDelete: 504 curOp = sqlmodel.DMLDelete 505 } 506 507 if i == 0 { 508 lastOp = curOp 509 } 510 511 // now there are 5 situations: [insert, replace(insert with safemode), insert on duplicate(update without identify keys), update(update identify keys/update with safemode), delete] 512 if lastOp != curOp { 513 query, arg := genDMLsWithSameTable(lastOp, jobsWithSameOp) 514 queries = append(queries, query...) 515 args = append(args, arg...) 516 517 jobsWithSameOp = jobsWithSameOp[0:0] 518 lastOp = curOp 519 } 520 jobsWithSameOp = append(jobsWithSameOp, j) 521 } 522 if len(jobsWithSameOp) > 0 { 523 query, arg := genDMLsWithSameTable(lastOp, jobsWithSameOp) 524 queries = append(queries, query...) 525 args = append(args, arg...) 526 } 527 return queries, args 528 }