github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/reader.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package mydump
    15  
    16  import (
    17  	"bufio"
    18  	"bytes"
    19  	"context"
    20  	"io"
    21  	"strings"
    22  	"unicode/utf8"
    23  
    24  	"github.com/pingcap/errors"
    25  	"go.uber.org/zap"
    26  	"golang.org/x/text/encoding/simplifiedchinese"
    27  
    28  	"github.com/pingcap/br/pkg/lightning/log"
    29  	"github.com/pingcap/br/pkg/lightning/worker"
    30  	"github.com/pingcap/br/pkg/storage"
    31  )
    32  
// Sentinel errors declared by this file.
//
// ErrInsertStatementNotFound is exported for use by other code; nothing in
// this file returns it (presumably raised by the SQL parsers elsewhere in the
// package — TODO confirm against callers).
//
// errInvalidSchemaEncoding is returned by decodeCharacterSet when the input is
// not valid UTF-8 under "utf8mb4", or when GB18030 decoding yields U+FFFD.
var (
	ErrInsertStatementNotFound = errors.New("insert statement not found")
	errInvalidSchemaEncoding   = errors.New("invalid schema encoding")
)
    37  
    38  func decodeCharacterSet(data []byte, characterSet string) ([]byte, error) {
    39  	switch characterSet {
    40  	case "binary":
    41  		// do nothing
    42  	case "auto", "utf8mb4":
    43  		if utf8.Valid(data) {
    44  			break
    45  		}
    46  		if characterSet == "utf8mb4" {
    47  			return nil, errInvalidSchemaEncoding
    48  		}
    49  		// try gb18030 next if the encoding is "auto"
    50  		// if we support too many encodings, consider switching strategy to
    51  		// perform `chardet` first.
    52  		fallthrough
    53  	case "gb18030":
    54  		decoded, err := simplifiedchinese.GB18030.NewDecoder().Bytes(data)
    55  		if err != nil {
    56  			return nil, errors.Trace(err)
    57  		}
    58  		// check for U+FFFD to see if decoding contains errors.
    59  		// https://groups.google.com/d/msg/golang-nuts/pENT3i4zJYk/v2X3yyiICwAJ
    60  		if bytes.ContainsRune(decoded, '\ufffd') {
    61  			return nil, errInvalidSchemaEncoding
    62  		}
    63  		data = decoded
    64  	default:
    65  		return nil, errors.Errorf("Unsupported encoding %s", characterSet)
    66  	}
    67  	return data, nil
    68  }
    69  
    70  func ExportStatement(ctx context.Context, store storage.ExternalStorage, sqlFile FileInfo, characterSet string) ([]byte, error) {
    71  	fd, err := store.Open(ctx, sqlFile.FileMeta.Path)
    72  	if err != nil {
    73  		return nil, errors.Trace(err)
    74  	}
    75  	defer fd.Close()
    76  
    77  	br := bufio.NewReader(fd)
    78  
    79  	data := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
    80  	buffer := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
    81  	for {
    82  		line, err := br.ReadBytes('\n')
    83  		if errors.Cause(err) == io.EOF {
    84  			if len(line) == 0 { // it will return EOF if there is no trailing new line.
    85  				break
    86  			}
    87  		} else if err != nil {
    88  			return nil, errors.Trace(err)
    89  		}
    90  
    91  		line = bytes.TrimSpace(line)
    92  		if len(line) == 0 {
    93  			continue
    94  		}
    95  
    96  		buffer = append(buffer, line...)
    97  		if buffer[len(buffer)-1] == ';' {
    98  			statement := string(buffer)
    99  			if !(strings.HasPrefix(statement, "/*") && strings.HasSuffix(statement, "*/;")) {
   100  				data = append(data, buffer...)
   101  			}
   102  			buffer = buffer[:0]
   103  		} else {
   104  			buffer = append(buffer, '\n')
   105  		}
   106  	}
   107  
   108  	data, err = decodeCharacterSet(data, characterSet)
   109  	if err != nil {
   110  		log.L().Error("cannot decode input file, please convert to target encoding manually",
   111  			zap.String("encoding", characterSet),
   112  			zap.String("Path", sqlFile.FileMeta.Path),
   113  		)
   114  		return nil, errors.Annotatef(err, "failed to decode %s as %s", sqlFile.FileMeta.Path, characterSet)
   115  	}
   116  	return data, nil
   117  }
   118  
   119  // ReadSeekCloser = Reader + Seeker + Closer
   120  type ReadSeekCloser interface {
   121  	io.Reader
   122  	io.Seeker
   123  	io.Closer
   124  }
   125  
   126  // StringReader is a wrapper around *strings.Reader with an additional Close() method
   127  type StringReader struct{ *strings.Reader }
   128  
   129  // NewStringReader constructs a new StringReader
   130  func NewStringReader(s string) StringReader {
   131  	return StringReader{Reader: strings.NewReader(s)}
   132  }
   133  
   134  // Close implements io.Closer
   135  func (sr StringReader) Close() error {
   136  	return nil
   137  }
   138  
// PooledReader is a throttled reader wrapper, where Read() calls have an upper limit of concurrency
// imposed by the given worker pool.
//
// Read, Seek and ReadFull each take a worker from ioWorkers for the duration
// of the call; Close is forwarded to the wrapped reader without taking one.
type PooledReader struct {
	reader    ReadSeekCloser // underlying source that every call is forwarded to
	ioWorkers *worker.Pool   // pool a worker is taken from around Read/Seek/ReadFull
}
   145  
   146  // MakePooledReader constructs a new PooledReader.
   147  func MakePooledReader(reader ReadSeekCloser, ioWorkers *worker.Pool) PooledReader {
   148  	return PooledReader{
   149  		reader:    reader,
   150  		ioWorkers: ioWorkers,
   151  	}
   152  }
   153  
   154  // Read implements io.Reader
   155  func (pr PooledReader) Read(p []byte) (n int, err error) {
   156  	w := pr.ioWorkers.Apply()
   157  	defer pr.ioWorkers.Recycle(w)
   158  	return pr.reader.Read(p)
   159  }
   160  
   161  // Seek implements io.Seeker
   162  func (pr PooledReader) Seek(offset int64, whence int) (int64, error) {
   163  	w := pr.ioWorkers.Apply()
   164  	defer pr.ioWorkers.Recycle(w)
   165  	return pr.reader.Seek(offset, whence)
   166  }
   167  
   168  // Close implements io.Closer
   169  func (pr PooledReader) Close() error {
   170  	return pr.reader.Close()
   171  }
   172  
   173  // ReadFull is same as `io.ReadFull(pr)` with less worker recycling
   174  func (pr PooledReader) ReadFull(buf []byte) (n int, err error) {
   175  	w := pr.ioWorkers.Apply()
   176  	defer pr.ioWorkers.Recycle(w)
   177  	return io.ReadFull(pr.reader, buf)
   178  }