github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/reader.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package mydump

import (
	"bufio"
	"bytes"
	"context"
	"io"
	"strings"
	"unicode/utf8"

	"github.com/pingcap/br/pkg/storage"
	"go.uber.org/zap"

	"github.com/pingcap/errors"
	"golang.org/x/text/encoding/simplifiedchinese"

	"github.com/pingcap/tidb-lightning/lightning/log"
	"github.com/pingcap/tidb-lightning/lightning/worker"
)

var (
	ErrInsertStatementNotFound = errors.New("insert statement not found")
	errInvalidSchemaEncoding   = errors.New("invalid schema encoding")
)

// decodeCharacterSet decodes the schema file content from the given character
// set into UTF-8. "binary" leaves the bytes untouched; "auto" tries UTF-8
// first and falls back to GB18030.
func decodeCharacterSet(data []byte, characterSet string) ([]byte, error) {
	switch characterSet {
	case "binary":
		// do nothing
	case "auto", "utf8mb4":
		if utf8.Valid(data) {
			break
		}
		if characterSet == "utf8mb4" {
			return nil, errInvalidSchemaEncoding
		}
		// try gb18030 next if the encoding is "auto"
		// if we support too many encodings, consider switching strategy to
		// perform `chardet` first.
		fallthrough
	case "gb18030":
		decoded, err := simplifiedchinese.GB18030.NewDecoder().Bytes(data)
		if err != nil {
			return nil, errors.Trace(err)
		}
		// check for U+FFFD to see if the decoding contains errors.
		// https://groups.google.com/d/msg/golang-nuts/pENT3i4zJYk/v2X3yyiICwAJ
		if bytes.ContainsRune(decoded, '\ufffd') {
			return nil, errInvalidSchemaEncoding
		}
		data = decoded
	default:
		return nil, errors.Errorf("Unsupported encoding %s", characterSet)
	}
	return data, nil
}

// ExportStatement reads the SQL schema file at sqlFile from the external
// storage, drops blank lines and statements consisting solely of /* ... */;
// comments, joins the remaining statements, and decodes the result according
// to characterSet.
func ExportStatement(ctx context.Context, store storage.ExternalStorage, sqlFile FileInfo, characterSet string) ([]byte, error) {
	fd, err := store.Open(ctx, sqlFile.FileMeta.Path)
	if err != nil {
		return nil, errors.Trace(err)
	}
	defer fd.Close()

	br := bufio.NewReader(fd)

	data := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
	buffer := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
	for {
		line, err := br.ReadBytes('\n')
		if errors.Cause(err) == io.EOF {
			if len(line) == 0 { // it will return EOF if there is no trailing newline.
				break
			}
		} else if err != nil {
			return nil, errors.Trace(err)
		}

		line = bytes.TrimSpace(line)
		if len(line) == 0 {
			continue
		}

		buffer = append(buffer, line...)
		if buffer[len(buffer)-1] == ';' {
			statement := string(buffer)
			if !(strings.HasPrefix(statement, "/*") && strings.HasSuffix(statement, "*/;")) {
				data = append(data, buffer...)
			}
			buffer = buffer[:0]
		} else {
			buffer = append(buffer, '\n')
		}
	}

	data, err = decodeCharacterSet(data, characterSet)
	if err != nil {
		log.L().Error("cannot decode input file, please convert to target encoding manually",
			zap.String("encoding", characterSet),
			zap.String("Path", sqlFile.FileMeta.Path),
		)
		return nil, errors.Annotatef(err, "failed to decode %s as %s", sqlFile.FileMeta.Path, characterSet)
	}
	return data, nil
}

// ReadSeekCloser = Reader + Seeker + Closer
type ReadSeekCloser interface {
	io.Reader
	io.Seeker
	io.Closer
}

// StringReader is a wrapper around *strings.Reader with an additional Close() method
type StringReader struct{ *strings.Reader }

// NewStringReader constructs a new StringReader
func NewStringReader(s string) StringReader {
	return StringReader{Reader: strings.NewReader(s)}
}

// Close implements io.Closer
func (sr StringReader) Close() error {
	return nil
}

// PooledReader is a throttled reader wrapper, where Read() calls have an upper limit of concurrency
// imposed by the given worker pool.
type PooledReader struct {
	reader    ReadSeekCloser
	ioWorkers *worker.Pool
}

// MakePooledReader constructs a new PooledReader.
func MakePooledReader(reader ReadSeekCloser, ioWorkers *worker.Pool) PooledReader {
	return PooledReader{
		reader:    reader,
		ioWorkers: ioWorkers,
	}
}

// Read implements io.Reader
func (pr PooledReader) Read(p []byte) (n int, err error) {
	w := pr.ioWorkers.Apply()
	defer pr.ioWorkers.Recycle(w)
	return pr.reader.Read(p)
}

// Seek implements io.Seeker
func (pr PooledReader) Seek(offset int64, whence int) (int64, error) {
	w := pr.ioWorkers.Apply()
	defer pr.ioWorkers.Recycle(w)
	return pr.reader.Seek(offset, whence)
}

// Close implements io.Closer
func (pr PooledReader) Close() error {
	return pr.reader.Close()
}

// ReadFull is the same as `io.ReadFull(pr)` but with less worker recycling
func (pr PooledReader) ReadFull(buf []byte) (n int, err error) {
	w := pr.ioWorkers.Apply()
	defer pr.ioWorkers.Recycle(w)
	return io.ReadFull(pr.reader, buf)
}
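
// Usage sketch (illustrative only, not part of the original file): wrap a file
// opened from external storage in a PooledReader so that concurrent Read/Seek
// calls are throttled by a shared worker pool. The pool size, file name, and
// surrounding variables (ctx, store) are assumptions made for this example;
// worker.NewPool, storage.ExternalStorage.Open, and the PooledReader methods
// above are the only real APIs referenced.
//
//	ioWorkers := worker.NewPool(ctx, 4, "io")          // at most 4 concurrent I/O calls
//	fd, err := store.Open(ctx, "db1.tbl1-schema.sql")  // store is a storage.ExternalStorage
//	if err != nil {
//		return errors.Trace(err)
//	}
//	pr := MakePooledReader(fd, ioWorkers)
//	defer pr.Close()
//
//	header := make([]byte, 16)
//	if _, err := pr.ReadFull(header); err != nil {     // throttled io.ReadFull
//		return errors.Trace(err)
//	}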