github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/sink/codec/csv/csv_decoder.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package csv
    15  
    16  import (
    17  	"context"
    18  	"io"
    19  
    20  	"github.com/pingcap/errors"
    21  	lconfig "github.com/pingcap/tidb/pkg/lightning/config"
    22  	"github.com/pingcap/tidb/pkg/lightning/mydump"
    23  	"github.com/pingcap/tidb/pkg/lightning/worker"
    24  	"github.com/pingcap/tiflow/cdc/model"
    25  	cerror "github.com/pingcap/tiflow/pkg/errors"
    26  	"github.com/pingcap/tiflow/pkg/sink/codec"
    27  	"github.com/pingcap/tiflow/pkg/sink/codec/common"
    28  )
    29  
// defaultIOConcurrency is passed to worker.NewPool when NewBatchDecoder
// creates the "io" worker pool handed to the CSV parser (presumably the
// pool's concurrency limit; confirm against worker.NewPool).
const defaultIOConcurrency = 1
    31  
// batchDecoder decodes rows out of a CSV payload. NewBatchDecoder returns it
// as a codec.RowEventDecoder.
type batchDecoder struct {
	// codecConfig supplies the CSV parser settings (delimiter, quote,
	// terminator, null string) and is passed along when a decoded message is
	// converted into a row changed event.
	codecConfig *common.Config
	// parser reads CSV rows from the payload given to NewBatchDecoder.
	parser      *mydump.CSVParser
	// data holds the raw payload passed to NewBatchDecoder.
	data        []byte
	// msg is the message HasNext decodes each parsed row into; it is the
	// input of NextRowChangedEvent.
	msg         *csvMessage
	// tableInfo is passed along when a decoded message is converted into a
	// row changed event.
	tableInfo   *model.TableInfo
	// closed is set by HasNext once the parser returns an error (EOF
	// included); NextRowChangedEvent returns an error when it is set.
	closed      bool
}
    40  
    41  // NewBatchDecoder creates a new BatchDecoder
    42  func NewBatchDecoder(ctx context.Context,
    43  	codecConfig *common.Config,
    44  	tableInfo *model.TableInfo,
    45  	value []byte,
    46  ) (codec.RowEventDecoder, error) {
    47  	var backslashEscape bool
    48  
    49  	// if quote is not set in config, we should unespace backslash
    50  	// when parsing csv columns.
    51  	if len(codecConfig.Quote) == 0 {
    52  		backslashEscape = true
    53  	}
    54  	cfg := &lconfig.CSVConfig{
    55  		Separator:       codecConfig.Delimiter,
    56  		Delimiter:       codecConfig.Quote,
    57  		Terminator:      codecConfig.Terminator,
    58  		Null:            []string{codecConfig.NullString},
    59  		BackslashEscape: backslashEscape,
    60  	}
    61  	csvParser, err := mydump.NewCSVParser(ctx, cfg,
    62  		mydump.NewStringReader(string(value)),
    63  		int64(lconfig.ReadBlockSize),
    64  		worker.NewPool(ctx, defaultIOConcurrency, "io"), false, nil)
    65  	if err != nil {
    66  		return nil, err
    67  	}
    68  	return &batchDecoder{
    69  		codecConfig: codecConfig,
    70  		tableInfo:   tableInfo,
    71  		data:        value,
    72  		msg:         newCSVMessage(codecConfig),
    73  		parser:      csvParser,
    74  	}, nil
    75  }
    76  
// AddKeyValue implements the RowEventDecoder interface.
// It ignores both arguments and always returns nil; the payload to decode is
// the value handed to NewBatchDecoder.
func (b *batchDecoder) AddKeyValue(_, _ []byte) error {
	return nil
}
    81  
    82  // HasNext implements the RowEventDecoder interface.
    83  func (b *batchDecoder) HasNext() (model.MessageType, bool, error) {
    84  	err := b.parser.ReadRow()
    85  	if err != nil {
    86  		b.closed = true
    87  		if errors.Cause(err) == io.EOF {
    88  			return model.MessageTypeUnknown, false, nil
    89  		}
    90  		return model.MessageTypeUnknown, false, err
    91  	}
    92  
    93  	row := b.parser.LastRow()
    94  	if err = b.msg.decode(row.Row); err != nil {
    95  		return model.MessageTypeUnknown, false, errors.Trace(err)
    96  	}
    97  
    98  	return model.MessageTypeRow, true, nil
    99  }
   100  
// NextResolvedEvent implements the RowEventDecoder interface.
// It always returns 0 and a nil error.
func (b *batchDecoder) NextResolvedEvent() (uint64, error) {
	return 0, nil
}
   105  
   106  // NextRowChangedEvent implements the RowEventDecoder interface.
   107  func (b *batchDecoder) NextRowChangedEvent() (*model.RowChangedEvent, error) {
   108  	if b.closed {
   109  		return nil, cerror.WrapError(cerror.ErrCSVDecodeFailed, errors.New("no csv row can be found"))
   110  	}
   111  
   112  	e, err := csvMsg2RowChangedEvent(b.codecConfig, b.msg, b.tableInfo)
   113  	if err != nil {
   114  		return nil, errors.Trace(err)
   115  	}
   116  	return e, nil
   117  }
   118  
// NextDDLEvent implements the RowEventDecoder interface.
// It always returns a nil event and a nil error.
func (b *batchDecoder) NextDDLEvent() (*model.DDLEvent, error) {
	return nil, nil
}