github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/sink/codec/craft/model.go (about)

     1  // Copyright 2021 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package craft
    15  
    16  import (
    17  	"github.com/pingcap/errors"
    18  	"github.com/pingcap/tiflow/cdc/model"
    19  	cerror "github.com/pingcap/tiflow/pkg/errors"
    20  )
    21  
    22  const (
    23  	// Version1 represents the version of craft format
    24  	Version1 uint64 = 1
    25  
    26  	// DefaultBufferCapacity is default buffer size
    27  	DefaultBufferCapacity = 1024
    28  
    29  	// Column group types
    30  	columnGroupTypeOld = 0x2
    31  	columnGroupTypeNew = 0x1
    32  
    33  	// Size tables index
    34  	metaSizeTableIndex             = 0
    35  	bodySizeTableIndex             = 1
    36  	columnGroupSizeTableStartIndex = 2
    37  
    38  	// meta size table index
    39  	headerSizeIndex         = 0
    40  	termDictionarySizeIndex = 1
    41  	maxMetaSizeIndex        = termDictionarySizeIndex
    42  
    43  	nullInt64 = -1
    44  )
    45  
    46  var (
    47  	oneNullInt64Slice           = []int64{nullInt64}
    48  	oneNullStringSlice          = []*string{nil}
    49  	emptyDecodingTermDictionary = &termDictionary{
    50  		id: make([]string, 0),
    51  	}
    52  )
    53  
    54  type termDictionary struct {
    55  	term map[string]int
    56  	id   []string
    57  }
    58  
    59  func newEncodingTermDictionaryWithSize(size int) *termDictionary {
    60  	return &termDictionary{
    61  		term: make(map[string]int),
    62  		id:   make([]string, 0, size),
    63  	}
    64  }
    65  
    66  func newEncodingTermDictionary() *termDictionary {
    67  	return newEncodingTermDictionaryWithSize(8) // TODO, this number should be evaluated
    68  }
    69  
    70  func (d *termDictionary) encodeNullable(s *string) int64 {
    71  	if s == nil {
    72  		return nullInt64
    73  	}
    74  	return d.encode(*s)
    75  }
    76  
    77  func (d *termDictionary) encode(s string) int64 {
    78  	id, ok := d.term[s]
    79  	if !ok {
    80  		id := len(d.id)
    81  		d.term[s] = id
    82  		d.id = append(d.id, s)
    83  		return int64(id)
    84  	}
    85  	return int64(id)
    86  }
    87  
    88  func (d *termDictionary) encodeNullableChunk(array []*string) []int64 {
    89  	result := make([]int64, len(array))
    90  	for idx, s := range array {
    91  		result[idx] = d.encodeNullable(s)
    92  	}
    93  	return result
    94  }
    95  
    96  func (d *termDictionary) encodeChunk(array []string) []int64 {
    97  	result := make([]int64, len(array))
    98  	for idx, s := range array {
    99  		result[idx] = d.encode(s)
   100  	}
   101  	return result
   102  }
   103  
   104  func (d *termDictionary) decode(id int64) (string, error) {
   105  	i := int(id)
   106  	if len(d.id) <= i || i < 0 {
   107  		return "", cerror.ErrCraftCodecInvalidData.GenWithStack("invalid term id")
   108  	}
   109  	return d.id[i], nil
   110  }
   111  
   112  func (d *termDictionary) decodeNullable(id int64) (*string, error) {
   113  	if id == nullInt64 {
   114  		return nil, nil
   115  	}
   116  	if id < nullInt64 {
   117  		return nil, cerror.ErrCraftCodecInvalidData.GenWithStack("invalid term id")
   118  	}
   119  	s, err := d.decode(id)
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  	return &s, nil
   124  }
   125  
   126  func (d *termDictionary) decodeChunk(array []int64) ([]string, error) {
   127  	result := make([]string, len(array))
   128  	for idx, id := range array {
   129  		t, err := d.decode(id)
   130  		if err != nil {
   131  			return nil, err
   132  		}
   133  		result[idx] = t
   134  	}
   135  	return result, nil
   136  }
   137  
   138  func (d *termDictionary) decodeNullableChunk(array []int64) ([]*string, error) {
   139  	result := make([]*string, len(array))
   140  	for idx, id := range array {
   141  		t, err := d.decodeNullable(id)
   142  		if err != nil {
   143  			return nil, err
   144  		}
   145  		result[idx] = t
   146  	}
   147  	return result, nil
   148  }
   149  
   150  func encodeTermDictionary(bits []byte, dict *termDictionary) []byte {
   151  	if len(dict.id) == 0 {
   152  		return bits
   153  	}
   154  	bits = encodeUvarint(bits, uint64(len(dict.id)))
   155  	bits = encodeStringChunk(bits, dict.id)
   156  	return bits
   157  }
   158  
   159  func decodeTermDictionary(bits []byte, allocator *SliceAllocator) ([]byte, *termDictionary, error) {
   160  	newBits, l, err := decodeUvarint(bits)
   161  	if err != nil {
   162  		return bits, nil, err
   163  	}
   164  	newBits, id, err := decodeStringChunk(newBits, int(l), allocator)
   165  	if err != nil {
   166  		return bits, nil, err
   167  	}
   168  	return newBits, &termDictionary{id: id}, nil
   169  }
   170  
   171  // Headers in columnar layout
   172  type Headers struct {
   173  	ts        []uint64
   174  	ty        []uint64
   175  	partition []int64
   176  	schema    []*string
   177  	table     []*string
   178  
   179  	count int
   180  }
   181  
   182  // Count returns number of headers
   183  func (h *Headers) Count() int {
   184  	return h.count
   185  }
   186  
   187  func (h *Headers) encode(bits []byte, dict *termDictionary) []byte {
   188  	bits = encodeDeltaUvarintChunk(bits, h.ts[:h.count])
   189  	bits = encodeUvarintChunk(bits, h.ty[:h.count])
   190  	bits = encodeDeltaVarintChunk(bits, h.partition[:h.count])
   191  	bits = encodeDeltaVarintChunk(bits, dict.encodeNullableChunk(h.schema[:h.count]))
   192  	bits = encodeDeltaVarintChunk(bits, dict.encodeNullableChunk(h.table[:h.count]))
   193  	return bits
   194  }
   195  
   196  func (h *Headers) appendHeader(allocator *SliceAllocator, ts, ty uint64, partition int64, schema, table *string) int {
   197  	idx := h.count
   198  	if idx+1 > len(h.ty) {
   199  		size := newBufferSize(idx)
   200  		h.ts = allocator.resizeUint64Slice(h.ts, size)
   201  		h.ty = allocator.resizeUint64Slice(h.ty, size)
   202  		h.partition = allocator.resizeInt64Slice(h.partition, size)
   203  		h.schema = allocator.resizeNullableStringSlice(h.schema, size)
   204  		h.table = allocator.resizeNullableStringSlice(h.table, size)
   205  	}
   206  	h.ts[idx] = ts
   207  	h.ty[idx] = ty
   208  	h.partition[idx] = partition
   209  	h.schema[idx] = schema
   210  	h.table[idx] = table
   211  	h.count++
   212  
   213  	return 32 + len(*schema) + len(*table) /* 4 64-bits integers and two bytes array */
   214  }
   215  
   216  func (h *Headers) reset() {
   217  	h.count = 0
   218  }
   219  
   220  // GetType returns type of event at given index
   221  func (h *Headers) GetType(index int) model.MessageType {
   222  	return model.MessageType(h.ty[index])
   223  }
   224  
   225  // GetTs returns timestamp of event at given index
   226  func (h *Headers) GetTs(index int) uint64 {
   227  	return h.ts[index]
   228  }
   229  
   230  // GetPartition returns partition of event at given index
   231  func (h *Headers) GetPartition(index int) int64 {
   232  	return h.partition[index]
   233  }
   234  
   235  // GetSchema returns schema of event at given index
   236  func (h *Headers) GetSchema(index int) string {
   237  	if h.schema[index] != nil {
   238  		return *h.schema[index]
   239  	}
   240  	return ""
   241  }
   242  
   243  // GetTable returns table of event at given index
   244  func (h *Headers) GetTable(index int) string {
   245  	if h.table[index] != nil {
   246  		return *h.table[index]
   247  	}
   248  	return ""
   249  }
   250  
   251  func decodeHeaders(bits []byte, numHeaders int, allocator *SliceAllocator, dict *termDictionary) (*Headers, error) {
   252  	var ts, ty []uint64
   253  	var partition, tmp []int64
   254  	var schema, table []*string
   255  	var err error
   256  	if bits, ts, err = decodeDeltaUvarintChunk(bits, numHeaders, allocator); err != nil {
   257  		return nil, errors.Trace(err)
   258  	}
   259  	if bits, ty, err = decodeUvarintChunk(bits, numHeaders, allocator); err != nil {
   260  		return nil, errors.Trace(err)
   261  	}
   262  	if bits, partition, err = decodeDeltaVarintChunk(bits, numHeaders, allocator); err != nil {
   263  		return nil, errors.Trace(err)
   264  	}
   265  	if bits, tmp, err = decodeDeltaVarintChunk(bits, numHeaders, allocator); err != nil {
   266  		return nil, errors.Trace(err)
   267  	}
   268  	if schema, err = dict.decodeNullableChunk(tmp); err != nil {
   269  		return nil, errors.Trace(err)
   270  	}
   271  	if _, tmp, err = decodeDeltaVarintChunk(bits, numHeaders, allocator); err != nil {
   272  		return nil, errors.Trace(err)
   273  	}
   274  	if table, err = dict.decodeNullableChunk(tmp); err != nil {
   275  		return nil, errors.Trace(err)
   276  	}
   277  	return &Headers{
   278  		ts:        ts,
   279  		ty:        ty,
   280  		partition: partition,
   281  		schema:    schema,
   282  		table:     table,
   283  		count:     numHeaders,
   284  	}, nil
   285  }
   286  
   287  // Column group in columnar layout
   288  type columnGroup struct {
   289  	ty     byte
   290  	names  []string
   291  	types  []uint64
   292  	flags  []uint64
   293  	values [][]byte
   294  }
   295  
   296  func (g *columnGroup) encode(bits []byte, dict *termDictionary) []byte {
   297  	bits = append(bits, g.ty)
   298  	bits = encodeUvarint(bits, uint64(len(g.names)))
   299  	bits = encodeDeltaVarintChunk(bits, dict.encodeChunk(g.names))
   300  	bits = encodeUvarintChunk(bits, g.types)
   301  	bits = encodeUvarintChunk(bits, g.flags)
   302  	bits = encodeNullableBytesChunk(bits, g.values)
   303  	return bits
   304  }
   305  
   306  // ToModel converts column group into model
   307  func (g *columnGroup) ToModel() ([]*model.Column, error) {
   308  	columns := make([]*model.Column, len(g.names))
   309  	for i, name := range g.names {
   310  		ty := byte(g.types[i])
   311  		flag := model.ColumnFlagType(g.flags[i])
   312  		value, err := DecodeTiDBType(ty, flag, g.values[i])
   313  		if err != nil {
   314  			return nil, errors.Trace(err)
   315  		}
   316  		columns[i] = &model.Column{
   317  			Name:  name,
   318  			Type:  ty,
   319  			Flag:  flag,
   320  			Value: value,
   321  		}
   322  	}
   323  	return columns, nil
   324  }
   325  
   326  func decodeColumnGroup(bits []byte, allocator *SliceAllocator, dict *termDictionary) (*columnGroup, error) {
   327  	var numColumns int
   328  	bits, ty, err := decodeUint8(bits)
   329  	if err != nil {
   330  		return nil, errors.Trace(err)
   331  	}
   332  	bits, numColumns, err = decodeUvarintLength(bits)
   333  	if err != nil {
   334  		return nil, errors.Trace(err)
   335  	}
   336  	var names []string
   337  	var tmp []int64
   338  	var values [][]byte
   339  	var types, flags []uint64
   340  	bits, tmp, err = decodeDeltaVarintChunk(bits, numColumns, allocator)
   341  	if err != nil {
   342  		return nil, errors.Trace(err)
   343  	}
   344  	names, err = dict.decodeChunk(tmp)
   345  	if err != nil {
   346  		return nil, errors.Trace(err)
   347  	}
   348  	bits, types, err = decodeUvarintChunk(bits, numColumns, allocator)
   349  	if err != nil {
   350  		return nil, errors.Trace(err)
   351  	}
   352  	bits, flags, err = decodeUvarintChunk(bits, numColumns, allocator)
   353  	if err != nil {
   354  		return nil, errors.Trace(err)
   355  	}
   356  	_, values, err = decodeNullableBytesChunk(bits, numColumns, allocator)
   357  	if err != nil {
   358  		return nil, errors.Trace(err)
   359  	}
   360  	return &columnGroup{
   361  		ty:     ty,
   362  		names:  names,
   363  		types:  types,
   364  		flags:  flags,
   365  		values: values,
   366  	}, nil
   367  }
   368  
   369  func newColumnGroup(allocator *SliceAllocator, ty byte, columns []*model.Column, onlyHandleKeyColumns bool) (int, *columnGroup) {
   370  	l := len(columns)
   371  	if l == 0 {
   372  		return 0, nil
   373  	}
   374  	values := allocator.bytesSlice(l)
   375  	names := allocator.stringSlice(l)
   376  	types := allocator.uint64Slice(l)
   377  	flags := allocator.uint64Slice(l)
   378  	estimatedSize := 0
   379  	idx := 0
   380  	for _, col := range columns {
   381  		if col == nil {
   382  			continue
   383  		}
   384  		if onlyHandleKeyColumns && !col.Flag.IsHandleKey() {
   385  			continue
   386  		}
   387  		names[idx] = col.Name
   388  		types[idx] = uint64(col.Type)
   389  		flags[idx] = uint64(col.Flag)
   390  		value := EncodeTiDBType(allocator, col.Type, col.Flag, col.Value)
   391  		values[idx] = value
   392  		estimatedSize += len(col.Name) + len(value) + 16 /* two 64-bits integers */
   393  		idx++
   394  	}
   395  	if idx > 0 {
   396  		return estimatedSize, &columnGroup{
   397  			ty:     ty,
   398  			names:  names[:idx],
   399  			types:  types[:idx],
   400  			flags:  flags[:idx],
   401  			values: values[:idx],
   402  		}
   403  	}
   404  	return estimatedSize, nil
   405  }
   406  
   407  // Row changed message is basically an array of column groups
   408  type rowChangedEvent = []*columnGroup
   409  
   410  func newRowChangedMessage(allocator *SliceAllocator, ev *model.RowChangedEvent, onlyHandleKeyColumns bool) (int, rowChangedEvent) {
   411  	numGroups := 0
   412  	if ev.PreColumns != nil {
   413  		numGroups++
   414  	}
   415  	if ev.Columns != nil {
   416  		numGroups++
   417  	}
   418  	groups := allocator.columnGroupSlice(numGroups)
   419  	estimatedSize := 0
   420  	idx := 0
   421  	if size, group := newColumnGroup(
   422  		allocator,
   423  		columnGroupTypeNew,
   424  		ev.GetColumns(),
   425  		false); group != nil {
   426  		groups[idx] = group
   427  		idx++
   428  		estimatedSize += size
   429  	}
   430  	onlyHandleKeyColumns = onlyHandleKeyColumns && ev.IsDelete()
   431  	if size, group := newColumnGroup(
   432  		allocator,
   433  		columnGroupTypeOld,
   434  		ev.GetPreColumns(),
   435  		onlyHandleKeyColumns); group != nil {
   436  		groups[idx] = group
   437  		estimatedSize += size
   438  	}
   439  	return estimatedSize, groups
   440  }
   441  
   442  // RowChangedEventBuffer is a buffer to save row changed events in batch
   443  type RowChangedEventBuffer struct {
   444  	headers *Headers
   445  
   446  	events        []rowChangedEvent
   447  	eventsCount   int
   448  	estimatedSize int
   449  
   450  	allocator *SliceAllocator
   451  }
   452  
   453  // NewRowChangedEventBuffer creates new row changed event buffer with given allocator
   454  func NewRowChangedEventBuffer(allocator *SliceAllocator) *RowChangedEventBuffer {
   455  	return &RowChangedEventBuffer{
   456  		headers:   &Headers{},
   457  		allocator: allocator,
   458  	}
   459  }
   460  
   461  // Encode row changed event buffer into bits
   462  func (b *RowChangedEventBuffer) Encode() []byte {
   463  	bits := NewMessageEncoder(b.allocator).encodeHeaders(b.headers).encodeRowChangeEvents(b.events[:b.eventsCount]).Encode()
   464  	b.Reset()
   465  	return bits
   466  }
   467  
   468  // AppendRowChangedEvent append a new event to buffer
   469  func (b *RowChangedEventBuffer) AppendRowChangedEvent(ev *model.RowChangedEvent, onlyHandleKeyColumns bool) (rows, size int) {
   470  	var partition int64 = -1
   471  	if ev.TableInfo.IsPartitionTable() {
   472  		partition = ev.PhysicalTableID
   473  	}
   474  
   475  	var schema, table *string
   476  	if len(ev.TableInfo.GetSchemaName()) > 0 {
   477  		schema = ev.TableInfo.GetSchemaNamePtr()
   478  	}
   479  	if len(ev.TableInfo.GetTableName()) > 0 {
   480  		table = ev.TableInfo.GetTableNamePtr()
   481  	}
   482  
   483  	b.estimatedSize += b.headers.appendHeader(
   484  		b.allocator,
   485  		ev.CommitTs,
   486  		uint64(model.MessageTypeRow),
   487  		partition,
   488  		schema,
   489  		table,
   490  	)
   491  	if b.eventsCount+1 > len(b.events) {
   492  		b.events = b.allocator.resizeRowChangedEventSlice(b.events, newBufferSize(b.eventsCount))
   493  	}
   494  	size, message := newRowChangedMessage(b.allocator, ev, onlyHandleKeyColumns)
   495  	b.events[b.eventsCount] = message
   496  	b.eventsCount++
   497  	b.estimatedSize += size
   498  	return b.eventsCount, b.estimatedSize
   499  }
   500  
   501  // Reset buffer
   502  func (b *RowChangedEventBuffer) Reset() {
   503  	b.headers.reset()
   504  	b.eventsCount = 0
   505  	b.estimatedSize = 0
   506  }
   507  
   508  // Size of buffer
   509  func (b *RowChangedEventBuffer) Size() int {
   510  	return b.estimatedSize
   511  }
   512  
   513  // RowsCount returns number of rows batched in this buffer.
   514  func (b *RowChangedEventBuffer) RowsCount() int {
   515  	return b.eventsCount
   516  }
   517  
   518  // GetHeaders returns headers of buffer
   519  func (b *RowChangedEventBuffer) GetHeaders() *Headers {
   520  	return b.headers
   521  }