github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/reader.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package mydump

import (
	"bufio"
	"bytes"
	"context"
	"io"
	"strings"
	"unicode/utf8"

	"github.com/pingcap/errors"
	"go.uber.org/zap"
	"golang.org/x/text/encoding/simplifiedchinese"

	"github.com/pingcap/br/pkg/lightning/log"
	"github.com/pingcap/br/pkg/lightning/worker"
	"github.com/pingcap/br/pkg/storage"
)

var (
	ErrInsertStatementNotFound = errors.New("insert statement not found")
	errInvalidSchemaEncoding   = errors.New("invalid schema encoding")
)

// decodeCharacterSet converts data into UTF-8 according to characterSet.
// Supported values are "binary" (no conversion), "utf8mb4", "gb18030", and
// "auto" (UTF-8 with a GB18030 fallback).
func decodeCharacterSet(data []byte, characterSet string) ([]byte, error) {
	switch characterSet {
	case "binary":
		// do nothing
	case "auto", "utf8mb4":
		if utf8.Valid(data) {
			break
		}
		if characterSet == "utf8mb4" {
			return nil, errInvalidSchemaEncoding
		}
		// try gb18030 next if the encoding is "auto"
		// if we support too many encodings, consider switching strategy to
		// perform `chardet` first.
		fallthrough
	case "gb18030":
		decoded, err := simplifiedchinese.GB18030.NewDecoder().Bytes(data)
		if err != nil {
			return nil, errors.Trace(err)
		}
		// check for U+FFFD to see if decoding contains errors.
		// https://groups.google.com/d/msg/golang-nuts/pENT3i4zJYk/v2X3yyiICwAJ
		if bytes.ContainsRune(decoded, '\ufffd') {
			return nil, errInvalidSchemaEncoding
		}
		data = decoded
	default:
		return nil, errors.Errorf("Unsupported encoding %s", characterSet)
	}
	return data, nil
}

// ExportStatement reads the given SQL file from the external storage, drops
// blank lines and comment-only statements (`/* ... */;`), and returns the
// remaining statements decoded into UTF-8 according to characterSet.
func ExportStatement(ctx context.Context, store storage.ExternalStorage, sqlFile FileInfo, characterSet string) ([]byte, error) {
	fd, err := store.Open(ctx, sqlFile.FileMeta.Path)
	if err != nil {
		return nil, errors.Trace(err)
	}
	defer fd.Close()

	br := bufio.NewReader(fd)

	data := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
	buffer := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
	for {
		line, err := br.ReadBytes('\n')
		if errors.Cause(err) == io.EOF {
			if len(line) == 0 { // it will return EOF if there is no trailing new line.
				break
			}
		} else if err != nil {
			return nil, errors.Trace(err)
		}

		line = bytes.TrimSpace(line)
		if len(line) == 0 {
			continue
		}

		buffer = append(buffer, line...)
		if buffer[len(buffer)-1] == ';' {
			statement := string(buffer)
			if !(strings.HasPrefix(statement, "/*") && strings.HasSuffix(statement, "*/;")) {
				data = append(data, buffer...)
			}
			buffer = buffer[:0]
		} else {
			buffer = append(buffer, '\n')
		}
	}

	data, err = decodeCharacterSet(data, characterSet)
	if err != nil {
		log.L().Error("cannot decode input file, please convert to target encoding manually",
			zap.String("encoding", characterSet),
			zap.String("Path", sqlFile.FileMeta.Path),
		)
		return nil, errors.Annotatef(err, "failed to decode %s as %s", sqlFile.FileMeta.Path, characterSet)
	}
	return data, nil
}
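
// What follows is an illustrative sketch, not part of the original file: it
// shows how a caller might use ExportStatement with the "auto" character set
// to read a *-schema.sql file from external storage. The function name and
// the error annotation are assumptions made for the example only.
func loadSchemaSketch(ctx context.Context, store storage.ExternalStorage, schemaFile FileInfo) (string, error) {
	// "auto" lets decodeCharacterSet try UTF-8 first and fall back to GB18030.
	stmts, err := ExportStatement(ctx, store, schemaFile, "auto")
	if err != nil {
		return "", errors.Annotatef(err, "read schema from %s", schemaFile.FileMeta.Path)
	}
	// ExportStatement has already stripped blank lines and `/* ... */;`
	// comment-only statements, so the result can go straight to the SQL parser.
	return string(stmts), nil
}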

// ReadSeekCloser = Reader + Seeker + Closer
type ReadSeekCloser interface {
	io.Reader
	io.Seeker
	io.Closer
}

// StringReader is a wrapper around *strings.Reader with an additional Close() method
type StringReader struct{ *strings.Reader }

// NewStringReader constructs a new StringReader
func NewStringReader(s string) StringReader {
	return StringReader{Reader: strings.NewReader(s)}
}

// Close implements io.Closer
func (sr StringReader) Close() error {
	return nil
}

// PooledReader is a throttled reader wrapper, where Read() calls have an upper limit of concurrency
// imposed by the given worker pool.
type PooledReader struct {
	reader    ReadSeekCloser
	ioWorkers *worker.Pool
}

// MakePooledReader constructs a new PooledReader.
func MakePooledReader(reader ReadSeekCloser, ioWorkers *worker.Pool) PooledReader {
	return PooledReader{
		reader:    reader,
		ioWorkers: ioWorkers,
	}
}

// Read implements io.Reader
func (pr PooledReader) Read(p []byte) (n int, err error) {
	w := pr.ioWorkers.Apply()
	defer pr.ioWorkers.Recycle(w)
	return pr.reader.Read(p)
}

// Seek implements io.Seeker
func (pr PooledReader) Seek(offset int64, whence int) (int64, error) {
	w := pr.ioWorkers.Apply()
	defer pr.ioWorkers.Recycle(w)
	return pr.reader.Seek(offset, whence)
}

// Close implements io.Closer
func (pr PooledReader) Close() error {
	return pr.reader.Close()
}

// ReadFull is the same as `io.ReadFull(pr)`, but acquires a worker only once
// for the whole call instead of once per Read.
func (pr PooledReader) ReadFull(buf []byte) (n int, err error) {
	w := pr.ioWorkers.Apply()
	defer pr.ioWorkers.Recycle(w)
	return io.ReadFull(pr.reader, buf)
}
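
// What follows is an illustrative sketch, not part of the original file: it
// shows how MakePooledReader throttles I/O through a shared worker pool. The
// worker.NewPool(ctx, limit, name) constructor is an assumption based on the
// lightning/worker package; the rest uses only identifiers defined above.
func pooledReaderSketch(ctx context.Context) (string, error) {
	// At most 2 Read/Seek/ReadFull calls may run concurrently across all
	// readers sharing this pool.
	ioWorkers := worker.NewPool(ctx, 2, "io")

	// StringReader stands in for any ReadSeekCloser, e.g. a file opened from
	// external storage.
	stmt := "INSERT INTO t VALUES (1);"
	pr := MakePooledReader(NewStringReader(stmt), ioWorkers)
	defer pr.Close()

	// ReadFull holds a single worker for the entire read.
	buf := make([]byte, len(stmt))
	if _, err := pr.ReadFull(buf); err != nil {
		return "", errors.Trace(err)
	}
	return string(buf), nil
}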