github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/reader.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package mydump
    15  
    16  import (
    17  	"bufio"
    18  	"bytes"
    19  	"context"
    20  	"io"
    21  	"strings"
    22  	"unicode/utf8"
    23  
    24  	"github.com/pingcap/br/pkg/storage"
    25  	"go.uber.org/zap"
    26  
    27  	"github.com/pingcap/errors"
    28  	"golang.org/x/text/encoding/simplifiedchinese"
    29  
    30  	"github.com/pingcap/tidb-lightning/lightning/log"
    31  	"github.com/pingcap/tidb-lightning/lightning/worker"
    32  )
    33  
    34  var (
        	// ErrInsertStatementNotFound is a sentinel error carrying the message
        	// "insert statement not found".
        	// NOTE(review): nothing in the visible part of this file returns it;
        	// presumably it is used by other code in the package — confirm.
    35  	ErrInsertStatementNotFound = errors.New("insert statement not found")
        	// errInvalidSchemaEncoding is returned by decodeCharacterSet when the
        	// input is not valid UTF-8 under "utf8mb4", or when decoding it as
        	// GB18030 produces U+FFFD replacement characters.
    36  	errInvalidSchemaEncoding   = errors.New("invalid schema encoding")
    37  )
    38  
    39  func decodeCharacterSet(data []byte, characterSet string) ([]byte, error) {
    40  	switch characterSet {
    41  	case "binary":
    42  		// do nothing
    43  	case "auto", "utf8mb4":
    44  		if utf8.Valid(data) {
    45  			break
    46  		}
    47  		if characterSet == "utf8mb4" {
    48  			return nil, errInvalidSchemaEncoding
    49  		}
    50  		// try gb18030 next if the encoding is "auto"
    51  		// if we support too many encodings, consider switching strategy to
    52  		// perform `chardet` first.
    53  		fallthrough
    54  	case "gb18030":
    55  		decoded, err := simplifiedchinese.GB18030.NewDecoder().Bytes(data)
    56  		if err != nil {
    57  			return nil, errors.Trace(err)
    58  		}
    59  		// check for U+FFFD to see if decoding contains errors.
    60  		// https://groups.google.com/d/msg/golang-nuts/pENT3i4zJYk/v2X3yyiICwAJ
    61  		if bytes.ContainsRune(decoded, '\ufffd') {
    62  			return nil, errInvalidSchemaEncoding
    63  		}
    64  		data = decoded
    65  	default:
    66  		return nil, errors.Errorf("Unsupported encoding %s", characterSet)
    67  	}
    68  	return data, nil
    69  }
    70  
    71  func ExportStatement(ctx context.Context, store storage.ExternalStorage, sqlFile FileInfo, characterSet string) ([]byte, error) {
    72  	fd, err := store.Open(ctx, sqlFile.FileMeta.Path)
    73  	if err != nil {
    74  		return nil, errors.Trace(err)
    75  	}
    76  	defer fd.Close()
    77  
    78  	br := bufio.NewReader(fd)
    79  
    80  	data := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
    81  	buffer := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
    82  	for {
    83  		line, err := br.ReadBytes('\n')
    84  		if errors.Cause(err) == io.EOF {
    85  			if len(line) == 0 { // it will return EOF if there is no trailing new line.
    86  				break
    87  			}
    88  		} else if err != nil {
    89  			return nil, errors.Trace(err)
    90  		}
    91  
    92  		line = bytes.TrimSpace(line)
    93  		if len(line) == 0 {
    94  			continue
    95  		}
    96  
    97  		buffer = append(buffer, line...)
    98  		if buffer[len(buffer)-1] == ';' {
    99  			statement := string(buffer)
   100  			if !(strings.HasPrefix(statement, "/*") && strings.HasSuffix(statement, "*/;")) {
   101  				data = append(data, buffer...)
   102  			}
   103  			buffer = buffer[:0]
   104  		} else {
   105  			buffer = append(buffer, '\n')
   106  		}
   107  	}
   108  
   109  	data, err = decodeCharacterSet(data, characterSet)
   110  	if err != nil {
   111  		log.L().Error("cannot decode input file, please convert to target encoding manually",
   112  			zap.String("encoding", characterSet),
   113  			zap.String("Path", sqlFile.FileMeta.Path),
   114  		)
   115  		return nil, errors.Annotatef(err, "failed to decode %s as %s", sqlFile.FileMeta.Path, characterSet)
   116  	}
   117  	return data, nil
   118  }
   119  
   120  // ReadSeekCloser = Reader + Seeker + Closer
   121  type ReadSeekCloser interface {
   122  	io.Reader
   123  	io.Seeker
   124  	io.Closer
   125  }
   126  
   127  // StringReader is a wrapper around *strings.Reader with an additional Close() method
   128  type StringReader struct{ *strings.Reader }
   129  
   130  // NewStringReader constructs a new StringReader
   131  func NewStringReader(s string) StringReader {
   132  	return StringReader{Reader: strings.NewReader(s)}
   133  }
   134  
   135  // Close implements io.Closer
   136  func (sr StringReader) Close() error {
   137  	return nil
   138  }
   139  
   140  // PooledReader is a throttled reader wrapper, where Read() calls have an upper limit of concurrency
   141  // imposed by the given worker pool.
   142  type PooledReader struct {
   143  	reader    ReadSeekCloser
   144  	ioWorkers *worker.Pool
   145  }
   146  
   147  // MakePooledReader constructs a new PooledReader.
   148  func MakePooledReader(reader ReadSeekCloser, ioWorkers *worker.Pool) PooledReader {
   149  	return PooledReader{
   150  		reader:    reader,
   151  		ioWorkers: ioWorkers,
   152  	}
   153  }
   154  
   155  // Read implements io.Reader
   156  func (pr PooledReader) Read(p []byte) (n int, err error) {
   157  	w := pr.ioWorkers.Apply()
   158  	defer pr.ioWorkers.Recycle(w)
   159  	return pr.reader.Read(p)
   160  }
   161  
   162  // Seek implements io.Seeker
   163  func (pr PooledReader) Seek(offset int64, whence int) (int64, error) {
   164  	w := pr.ioWorkers.Apply()
   165  	defer pr.ioWorkers.Recycle(w)
   166  	return pr.reader.Seek(offset, whence)
   167  }
   168  
   169  // Close implements io.Closer
   170  func (pr PooledReader) Close() error {
   171  	return pr.reader.Close()
   172  }
   173  
   174  // ReadFull is same as `io.ReadFull(pr)` with less worker recycling
   175  func (pr PooledReader) ReadFull(buf []byte) (n int, err error) {
   176  	w := pr.ioWorkers.Apply()
   177  	defer pr.ioWorkers.Recycle(w)
   178  	return io.ReadFull(pr.reader, buf)
   179  }