github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/syncer/dml.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package syncer
    15  
    16  import (
    17  	"encoding/binary"
    18  
    19  	"github.com/pingcap/tidb/pkg/expression"
    20  	"github.com/pingcap/tidb/pkg/parser/charset"
    21  	"github.com/pingcap/tidb/pkg/parser/model"
    22  	"github.com/pingcap/tidb/pkg/parser/mysql"
    23  	"github.com/pingcap/tidb/pkg/parser/types"
    24  	"github.com/pingcap/tidb/pkg/util/filter"
    25  	cdcmodel "github.com/pingcap/tiflow/cdc/model"
    26  	tcontext "github.com/pingcap/tiflow/dm/pkg/context"
    27  	"github.com/pingcap/tiflow/dm/pkg/log"
    28  	"github.com/pingcap/tiflow/dm/pkg/terror"
    29  	"github.com/pingcap/tiflow/dm/pkg/utils"
    30  	"github.com/pingcap/tiflow/pkg/sqlmodel"
    31  	"github.com/shopspring/decimal"
    32  	"go.uber.org/zap"
    33  	"golang.org/x/text/encoding/charmap"
    34  )
    35  
    36  // genDMLParam stores original data and table structure.
    37  type genDMLParam struct {
    38  	sourceTable     *filter.Table // origin table
    39  	targetTable     *filter.Table
    40  	safeMode        bool             // only used in update
    41  	originalData    [][]interface{}  // all data
    42  	sourceTableInfo *model.TableInfo // all table info
    43  	extendData      [][]interface{}  // all data include extend data
    44  }
    45  
    46  // latin1Decider is not usually ISO8859_1 in MySQL.
    47  // ref https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html
    48  var latin1Decoder = charmap.Windows1252.NewDecoder()
    49  
    50  // adjustValueFromBinlogData adjust the values obtained from go-mysql so that
    51  // - the values can be correctly converted to TiDB datum
    52  // - the values are in the correct type that go-sql-driver/mysql uses.
    53  func adjustValueFromBinlogData(
    54  	data []interface{},
    55  	sourceTI *model.TableInfo,
    56  ) ([]interface{}, error) {
    57  	value := make([]interface{}, 0, len(data))
    58  	var err error
    59  
    60  	columns := make([]*model.ColumnInfo, 0, len(sourceTI.Columns))
    61  	for _, col := range sourceTI.Columns {
    62  		if !col.Hidden {
    63  			columns = append(columns, col)
    64  		}
    65  	}
    66  	if len(data) != len(columns) {
    67  		return nil, terror.ErrSyncerUnitDMLColumnNotMatch.Generate(len(columns), len(data))
    68  	}
    69  
    70  	for i, d := range data {
    71  		d = castUnsigned(d, &columns[i].FieldType)
    72  		isLatin1 := columns[i].GetCharset() == charset.CharsetLatin1 || columns[i].GetCharset() == "" && sourceTI.Charset == charset.CharsetLatin1
    73  
    74  		switch v := d.(type) {
    75  		case int8:
    76  			d = int64(v)
    77  		case int16:
    78  			d = int64(v)
    79  		case int32:
    80  			d = int64(v)
    81  		case uint8:
    82  			d = uint64(v)
    83  		case uint16:
    84  			d = uint64(v)
    85  		case uint32:
    86  			d = uint64(v)
    87  		case uint:
    88  			d = uint64(v)
    89  		case decimal.Decimal:
    90  			d = v.String()
    91  		case []byte:
    92  			if isLatin1 {
    93  				d, err = latin1Decoder.Bytes(v)
    94  				// replicate wrong data and don't break task
    95  				if err != nil {
    96  					log.L().DPanic("can't convert latin1 to utf8", zap.ByteString("value", v), zap.Error(err))
    97  				}
    98  			}
    99  		case string:
   100  			isBinary := columns[i].GetType() == mysql.TypeString && mysql.HasBinaryFlag(columns[i].GetFlag())
   101  			isGBK := columns[i].GetCharset() == charset.CharsetGBK || columns[i].GetCharset() == "" && sourceTI.Charset == charset.CharsetGBK
   102  			switch {
   103  			case isBinary:
   104  				// convert string to []byte so that go-sql-driver/mysql can use _binary'value' for DML
   105  				d = []byte(v)
   106  				// if column is binary and value length is less than column length, we need to pad the value with 0x00
   107  				// ref: https://dev.mysql.com/doc/refman/8.0/en/binary-varbinary.html
   108  				valLen := columns[i].FieldType.GetFlen()
   109  				if valLen != types.UnspecifiedLength && valLen > len(v) {
   110  					padding := make([]byte, valLen-len(v))
   111  					d = append(d.([]byte), padding...)
   112  				}
   113  			case isGBK:
   114  				// convert string to []byte so that go-sql-driver/mysql can use _binary'value' for DML
   115  				d = []byte(v)
   116  			case isLatin1:
   117  				// TiDB has bug in latin1 so we must convert it to utf8 at DM's scope
   118  				// https://github.com/pingcap/tidb/issues/18955
   119  				d, err = latin1Decoder.String(v)
   120  				// replicate wrong data and don't break task
   121  				if err != nil {
   122  					log.L().DPanic("can't convert latin1 to utf8", zap.String("value", v), zap.Error(err))
   123  				}
   124  			}
   125  		}
   126  		value = append(value, d)
   127  	}
   128  	return value, nil
   129  }
   130  
   131  // nolint:dupl
   132  func (s *Syncer) genAndFilterInsertDMLs(tctx *tcontext.Context, param *genDMLParam, filterExprs []expression.Expression) ([]*sqlmodel.RowChange, error) {
   133  	var (
   134  		tableID         = utils.GenTableID(param.targetTable)
   135  		originalDataSeq = param.originalData
   136  		ti              = param.sourceTableInfo
   137  		extendData      = param.extendData
   138  		dmls            = make([]*sqlmodel.RowChange, 0, len(originalDataSeq))
   139  	)
   140  
   141  	// if downstream pk/uk(not null) exits, then use downstream pk/uk(not null)
   142  	downstreamTableInfo, err := s.schemaTracker.GetDownStreamTableInfo(tctx, tableID, ti)
   143  	if err != nil {
   144  		return nil, err
   145  	}
   146  
   147  	if extendData != nil {
   148  		originalDataSeq = extendData
   149  	}
   150  
   151  RowLoop:
   152  	for _, data := range originalDataSeq {
   153  		originalValue, err := adjustValueFromBinlogData(data, ti)
   154  		if err != nil {
   155  			return nil, err
   156  		}
   157  
   158  		for _, expr := range filterExprs {
   159  			skip, err := SkipDMLByExpression(s.sessCtx, originalValue, expr, ti.Columns)
   160  			if err != nil {
   161  				return nil, err
   162  			}
   163  			if skip {
   164  				s.filteredInsert.Add(1)
   165  				continue RowLoop
   166  			}
   167  		}
   168  
   169  		rowChange := sqlmodel.NewRowChange(
   170  			&cdcmodel.TableName{Schema: param.sourceTable.Schema, Table: param.sourceTable.Name},
   171  			&cdcmodel.TableName{Schema: param.targetTable.Schema, Table: param.targetTable.Name},
   172  			nil,
   173  			originalValue,
   174  			param.sourceTableInfo,
   175  			downstreamTableInfo.TableInfo,
   176  			s.sessCtx,
   177  		)
   178  		rowChange.SetWhereHandle(downstreamTableInfo.WhereHandle)
   179  		dmls = append(dmls, rowChange)
   180  	}
   181  
   182  	return dmls, nil
   183  }
   184  
   185  // nolint:dupl
   186  func (s *Syncer) genAndFilterUpdateDMLs(
   187  	tctx *tcontext.Context,
   188  	param *genDMLParam,
   189  	oldValueFilters []expression.Expression,
   190  	newValueFilters []expression.Expression,
   191  ) ([]*sqlmodel.RowChange, error) {
   192  	var (
   193  		tableID      = utils.GenTableID(param.targetTable)
   194  		originalData = param.originalData
   195  		ti           = param.sourceTableInfo
   196  		extendData   = param.extendData
   197  		dmls         = make([]*sqlmodel.RowChange, 0, len(originalData)/2)
   198  	)
   199  
   200  	// if downstream pk/uk(not null) exits, then use downstream pk/uk(not null)
   201  	downstreamTableInfo, err := s.schemaTracker.GetDownStreamTableInfo(tctx, tableID, ti)
   202  	if err != nil {
   203  		return nil, err
   204  	}
   205  
   206  	if extendData != nil {
   207  		originalData = extendData
   208  	}
   209  
   210  RowLoop:
   211  	for i := 0; i < len(originalData); i += 2 {
   212  		oriOldData := originalData[i]
   213  		oriChangedData := originalData[i+1]
   214  
   215  		if len(oriOldData) != len(oriChangedData) {
   216  			return nil, terror.ErrSyncerUnitDMLOldNewValueMismatch.Generate(len(oriOldData), len(oriChangedData))
   217  		}
   218  
   219  		oriOldValues, err := adjustValueFromBinlogData(oriOldData, ti)
   220  		if err != nil {
   221  			return nil, err
   222  		}
   223  		oriChangedValues, err := adjustValueFromBinlogData(oriChangedData, ti)
   224  		if err != nil {
   225  			return nil, err
   226  		}
   227  
   228  		for j := range oldValueFilters {
   229  			// AND logic
   230  			oldExpr, newExpr := oldValueFilters[j], newValueFilters[j]
   231  			skip1, err := SkipDMLByExpression(s.sessCtx, oriOldValues, oldExpr, ti.Columns)
   232  			if err != nil {
   233  				return nil, err
   234  			}
   235  			skip2, err := SkipDMLByExpression(s.sessCtx, oriChangedValues, newExpr, ti.Columns)
   236  			if err != nil {
   237  				return nil, err
   238  			}
   239  			if skip1 && skip2 {
   240  				s.filteredUpdate.Add(1)
   241  				// TODO: we skip generating the UPDATE SQL, so we left the old value here. Is this expected?
   242  				continue RowLoop
   243  			}
   244  		}
   245  
   246  		rowChange := sqlmodel.NewRowChange(
   247  			&cdcmodel.TableName{Schema: param.sourceTable.Schema, Table: param.sourceTable.Name},
   248  			&cdcmodel.TableName{Schema: param.targetTable.Schema, Table: param.targetTable.Name},
   249  			oriOldValues,
   250  			oriChangedValues,
   251  			param.sourceTableInfo,
   252  			downstreamTableInfo.TableInfo,
   253  			s.sessCtx,
   254  		)
   255  		rowChange.SetWhereHandle(downstreamTableInfo.WhereHandle)
   256  		dmls = append(dmls, rowChange)
   257  	}
   258  
   259  	return dmls, nil
   260  }
   261  
   262  // nolint:dupl
   263  func (s *Syncer) genAndFilterDeleteDMLs(tctx *tcontext.Context, param *genDMLParam, filterExprs []expression.Expression) ([]*sqlmodel.RowChange, error) {
   264  	var (
   265  		tableID    = utils.GenTableID(param.targetTable)
   266  		dataSeq    = param.originalData
   267  		ti         = param.sourceTableInfo
   268  		extendData = param.extendData
   269  		dmls       = make([]*sqlmodel.RowChange, 0, len(dataSeq))
   270  	)
   271  
   272  	// if downstream pk/uk(not null) exits, then use downstream pk/uk(not null)
   273  	downstreamTableInfo, err := s.schemaTracker.GetDownStreamTableInfo(tctx, tableID, ti)
   274  	if err != nil {
   275  		return nil, err
   276  	}
   277  
   278  	if extendData != nil {
   279  		dataSeq = extendData
   280  	}
   281  
   282  RowLoop:
   283  	for _, data := range dataSeq {
   284  		value, err := adjustValueFromBinlogData(data, ti)
   285  		if err != nil {
   286  			return nil, err
   287  		}
   288  
   289  		for _, expr := range filterExprs {
   290  			skip, err := SkipDMLByExpression(s.sessCtx, value, expr, ti.Columns)
   291  			if err != nil {
   292  				return nil, err
   293  			}
   294  			if skip {
   295  				s.filteredDelete.Add(1)
   296  				continue RowLoop
   297  			}
   298  		}
   299  
   300  		rowChange := sqlmodel.NewRowChange(
   301  			&cdcmodel.TableName{Schema: param.sourceTable.Schema, Table: param.sourceTable.Name},
   302  			&cdcmodel.TableName{Schema: param.targetTable.Schema, Table: param.targetTable.Name},
   303  			value,
   304  			nil,
   305  			param.sourceTableInfo,
   306  			downstreamTableInfo.TableInfo,
   307  			s.sessCtx,
   308  		)
   309  		rowChange.SetWhereHandle(downstreamTableInfo.WhereHandle)
   310  		dmls = append(dmls, rowChange)
   311  	}
   312  
   313  	return dmls, nil
   314  }
   315  
   316  func castUnsigned(data interface{}, ft *types.FieldType) interface{} {
   317  	if !mysql.HasUnsignedFlag(ft.GetFlag()) {
   318  		return data
   319  	}
   320  
   321  	switch v := data.(type) {
   322  	case int:
   323  		return uint(v)
   324  	case int8:
   325  		return uint8(v)
   326  	case int16:
   327  		return uint16(v)
   328  	case int32:
   329  		if ft.GetType() == mysql.TypeInt24 {
   330  			// we use int32 to store MEDIUMINT, if the value is signed, it's fine
   331  			// but if the value is un-signed, simply convert it use `uint32` may out of the range
   332  			// like -4692783 converted to 4290274513 (2^32 - 4692783), but we expect 12084433 (2^24 - 4692783)
   333  			data := make([]byte, 4)
   334  			binary.LittleEndian.PutUint32(data, uint32(v))
   335  			return uint32(data[0]) | uint32(data[1])<<8 | uint32(data[2])<<16
   336  		}
   337  		return uint32(v)
   338  	case int64:
   339  		return uint64(v)
   340  	}
   341  
   342  	return data
   343  }
   344  
   345  // checkLogColumns returns error when not all rows in skipped is empty, which means the binlog doesn't contain all
   346  // columns.
   347  // TODO: don't return error when all skipped columns is non-PK.
   348  func checkLogColumns(skipped [][]int) error {
   349  	for _, row := range skipped {
   350  		if len(row) > 0 {
   351  			return terror.ErrBinlogNotLogColumn
   352  		}
   353  	}
   354  	return nil
   355  }
   356  
   357  // genSQLMultipleRows generates multiple rows SQL with different dmlOpType.
   358  func genSQLMultipleRows(op sqlmodel.DMLType, dmls []*sqlmodel.RowChange) (queries string, args []interface{}) {
   359  	if len(dmls) > 1 {
   360  		log.L().Debug("generate DMLs with multiple rows", zap.Stringer("op", op), zap.Stringer("original op", dmls[0].Type()), zap.Int("rows", len(dmls)))
   361  	}
   362  	switch op {
   363  	case sqlmodel.DMLInsert, sqlmodel.DMLReplace, sqlmodel.DMLInsertOnDuplicateUpdate:
   364  		return sqlmodel.GenInsertSQL(op, dmls...)
   365  	case sqlmodel.DMLUpdate:
   366  		return sqlmodel.GenUpdateSQL(dmls...)
   367  	case sqlmodel.DMLDelete:
   368  		return sqlmodel.GenDeleteSQL(dmls...)
   369  	}
   370  	return
   371  }
   372  
   373  // genDMLsWithSameCols group and gen dmls by same columns.
   374  // in optimistic shard mode, different upstream tables may have different columns.
   375  // e.g.
   376  // insert into tb(a,b,c) values(1,1,1)
   377  // insert into tb(a,b,d) values(2,2,2)
   378  // we can only combine DMLs with same column names.
   379  // all dmls should have same dmlOpType and same tableName.
   380  func genDMLsWithSameCols(op sqlmodel.DMLType, dmls []*sqlmodel.RowChange) ([]string, [][]interface{}) {
   381  	queries := make([]string, 0, len(dmls))
   382  	args := make([][]interface{}, 0, len(dmls))
   383  	var lastDML *sqlmodel.RowChange
   384  	var query string
   385  	var arg []interface{}
   386  	groupDMLs := make([]*sqlmodel.RowChange, 0, len(dmls))
   387  
   388  	// group dmls by same columns
   389  	for i, dml := range dmls {
   390  		if i == 0 {
   391  			lastDML = dml
   392  		}
   393  		if !sqlmodel.SameTypeTargetAndColumns(lastDML, dml) {
   394  			query, arg = genSQLMultipleRows(op, groupDMLs)
   395  			queries = append(queries, query)
   396  			args = append(args, arg)
   397  
   398  			groupDMLs = groupDMLs[0:0]
   399  			lastDML = dml
   400  		}
   401  		groupDMLs = append(groupDMLs, dml)
   402  	}
   403  	if len(groupDMLs) > 0 {
   404  		query, arg = genSQLMultipleRows(op, groupDMLs)
   405  		queries = append(queries, query)
   406  		args = append(args, arg)
   407  	}
   408  	return queries, args
   409  }
   410  
   411  // genDMLsWithSameTable groups and generates dmls with same table.
   412  // all the dmls should have same dmlOpType.
   413  func genDMLsWithSameTable(op sqlmodel.DMLType, jobs []*job) ([]string, [][]interface{}) {
   414  	queries := make([]string, 0, len(jobs))
   415  	args := make([][]interface{}, 0, len(jobs))
   416  	var lastTable string
   417  	groupDMLs := make([]*sqlmodel.RowChange, 0, len(jobs))
   418  
   419  	if op == sqlmodel.DMLUpdate {
   420  		for i, j := range jobs {
   421  			if j.safeMode {
   422  				query, arg := j.dml.GenSQL(sqlmodel.DMLDelete)
   423  				queries = append(queries, query)
   424  				args = append(args, arg)
   425  				query, arg = j.dml.GenSQL(sqlmodel.DMLReplace)
   426  				queries = append(queries, query)
   427  				args = append(args, arg)
   428  				continue
   429  			}
   430  
   431  			if i == 0 {
   432  				lastTable = j.dml.TargetTableID()
   433  			}
   434  			if lastTable != j.dml.TargetTableID() {
   435  				query, arg := genDMLsWithSameCols(op, groupDMLs)
   436  				queries = append(queries, query...)
   437  				args = append(args, arg...)
   438  
   439  				groupDMLs = groupDMLs[0:0]
   440  				lastTable = j.dml.TargetTableID()
   441  			}
   442  			groupDMLs = append(groupDMLs, j.dml)
   443  		}
   444  		if len(groupDMLs) > 0 {
   445  			query, arg := genDMLsWithSameCols(op, groupDMLs)
   446  			queries = append(queries, query...)
   447  			args = append(args, arg...)
   448  		}
   449  		return queries, args
   450  	}
   451  
   452  	// group dmls with same table
   453  	for i, j := range jobs {
   454  		if i == 0 {
   455  			lastTable = j.dml.TargetTableID()
   456  		}
   457  		if lastTable != j.dml.TargetTableID() {
   458  			query, arg := genDMLsWithSameCols(op, groupDMLs)
   459  			queries = append(queries, query...)
   460  			args = append(args, arg...)
   461  
   462  			groupDMLs = groupDMLs[0:0]
   463  			lastTable = j.dml.TargetTableID()
   464  		}
   465  		groupDMLs = append(groupDMLs, j.dml)
   466  	}
   467  	if len(groupDMLs) > 0 {
   468  		query, arg := genDMLsWithSameCols(op, groupDMLs)
   469  		queries = append(queries, query...)
   470  		args = append(args, arg...)
   471  	}
   472  	return queries, args
   473  }
   474  
   475  // genDMLsWithSameOp groups and generates dmls by dmlOpType.
   476  // TODO: implement a volcano iterator interface for genDMLsWithSameXXX.
   477  func genDMLsWithSameOp(jobs []*job) ([]string, [][]interface{}) {
   478  	queries := make([]string, 0, len(jobs))
   479  	args := make([][]interface{}, 0, len(jobs))
   480  	var lastOp sqlmodel.DMLType
   481  	jobsWithSameOp := make([]*job, 0, len(jobs))
   482  
   483  	// group dmls with same dmlOp
   484  	for i, j := range jobs {
   485  		var curOp sqlmodel.DMLType
   486  		switch j.dml.Type() {
   487  		case sqlmodel.RowChangeUpdate:
   488  			// if update statement didn't update identify values and not in safemode, regard it as insert on duplicate.
   489  			if !j.dml.IsIdentityUpdated() && !j.safeMode {
   490  				curOp = sqlmodel.DMLInsertOnDuplicateUpdate
   491  				break
   492  			}
   493  
   494  			curOp = sqlmodel.DMLUpdate
   495  		case sqlmodel.RowChangeInsert:
   496  			// if insert with safemode, regard it as replace
   497  			if j.safeMode {
   498  				curOp = sqlmodel.DMLReplace
   499  				break
   500  			}
   501  
   502  			curOp = sqlmodel.DMLInsert
   503  		case sqlmodel.RowChangeDelete:
   504  			curOp = sqlmodel.DMLDelete
   505  		}
   506  
   507  		if i == 0 {
   508  			lastOp = curOp
   509  		}
   510  
   511  		// now there are 5 situations: [insert, replace(insert with safemode), insert on duplicate(update without identify keys), update(update identify keys/update with safemode), delete]
   512  		if lastOp != curOp {
   513  			query, arg := genDMLsWithSameTable(lastOp, jobsWithSameOp)
   514  			queries = append(queries, query...)
   515  			args = append(args, arg...)
   516  
   517  			jobsWithSameOp = jobsWithSameOp[0:0]
   518  			lastOp = curOp
   519  		}
   520  		jobsWithSameOp = append(jobsWithSameOp, j)
   521  	}
   522  	if len(jobsWithSameOp) > 0 {
   523  		query, arg := genDMLsWithSameTable(lastOp, jobsWithSameOp)
   524  		queries = append(queries, query...)
   525  		args = append(args, arg...)
   526  	}
   527  	return queries, args
   528  }