github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/importccl/read_import_mysqlout.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Licensed as a CockroachDB Enterprise file under the Cockroach Community 4 // License (the "License"); you may not use this file except in compliance with 5 // the License. You may obtain a copy of the License at 6 // 7 // https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt 8 9 package importccl 10 11 import ( 12 "bufio" 13 "context" 14 "fmt" 15 "io" 16 "unicode" 17 18 "github.com/cockroachdb/cockroach/pkg/roachpb" 19 "github.com/cockroachdb/cockroach/pkg/sql/row" 20 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 21 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 22 "github.com/cockroachdb/cockroach/pkg/storage/cloud" 23 "github.com/cockroachdb/cockroach/pkg/util/ctxgroup" 24 "github.com/cockroachdb/errors" 25 ) 26 27 type mysqloutfileReader struct { 28 importCtx *parallelImportContext 29 opts roachpb.MySQLOutfileOptions 30 } 31 32 var _ inputConverter = &mysqloutfileReader{} 33 34 func newMysqloutfileReader( 35 opts roachpb.MySQLOutfileOptions, 36 kvCh chan row.KVBatch, 37 walltime int64, 38 parallelism int, 39 tableDesc *sqlbase.TableDescriptor, 40 evalCtx *tree.EvalContext, 41 ) (*mysqloutfileReader, error) { 42 return &mysqloutfileReader{ 43 importCtx: ¶llelImportContext{ 44 walltime: walltime, 45 numWorkers: parallelism, 46 evalCtx: evalCtx, 47 tableDesc: tableDesc, 48 kvCh: kvCh, 49 }, 50 opts: opts, 51 }, nil 52 } 53 54 func (d *mysqloutfileReader) start(ctx ctxgroup.Group) { 55 } 56 57 func (d *mysqloutfileReader) readFiles( 58 ctx context.Context, 59 dataFiles map[int32]string, 60 resumePos map[int32]int64, 61 format roachpb.IOFileFormat, 62 makeExternalStorage cloud.ExternalStorageFactory, 63 ) error { 64 return readInputFiles(ctx, dataFiles, resumePos, format, d.readFile, makeExternalStorage) 65 } 66 67 type delimitedProducer struct { 68 importCtx *parallelImportContext 69 opts *roachpb.MySQLOutfileOptions 70 input *fileReader 71 reader *bufio.Reader 72 row []rune 73 err error 74 eof bool 75 } 76 77 var _ importRowProducer = &delimitedProducer{} 78 79 // Scan implements importRowProducer 80 func (d *delimitedProducer) Scan() bool { 81 d.row = nil 82 var r rune 83 var w int 84 nextLiteral := false 85 fieldEnclosed := false 86 87 for { 88 r, w, d.err = d.reader.ReadRune() 89 if d.err == io.EOF { 90 d.eof = true 91 d.err = nil 92 } 93 94 if d.eof { 95 if d.row != nil { 96 return true 97 } 98 if nextLiteral { 99 d.err = io.ErrUnexpectedEOF 100 } 101 return false 102 } 103 104 if d.err != nil { 105 return false 106 } 107 108 if r == unicode.ReplacementChar && w == 1 { 109 if d.err = d.reader.UnreadRune(); d.err != nil { 110 return false 111 } 112 var raw byte 113 raw, d.err = d.reader.ReadByte() 114 if d.err != nil { 115 return false 116 } 117 r = rune(raw) 118 } 119 120 if r == d.opts.RowSeparator && !nextLiteral && !fieldEnclosed { 121 return true 122 } 123 124 d.row = append(d.row, r) 125 126 if d.opts.HasEscape { 127 nextLiteral = !nextLiteral && r == d.opts.Escape 128 } 129 130 if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never && r == d.opts.Encloser { 131 // We only care about well formed, enclosed fields (i.e. ones that start with 132 // enclose rune. If we see enclose character anywhere else, then we either 133 // close the opened enclosing, or we treat this as an invalid enclosing, 134 // and let FillDatums below take care of reporting and handling any errors. 135 fieldEnclosed = len(d.row) == 1 136 } 137 } 138 } 139 140 // Err implements importRowProducer 141 func (d *delimitedProducer) Err() error { 142 return d.err 143 } 144 145 // Skip implements importRowProducer 146 func (d *delimitedProducer) Skip() error { 147 return nil // no-op 148 } 149 150 // Row implements importRowProducer 151 func (d *delimitedProducer) Row() (interface{}, error) { 152 return d.row, d.err 153 } 154 155 // Progress implements importRowProducer 156 func (d *delimitedProducer) Progress() float32 { 157 return d.input.ReadFraction() 158 } 159 160 type delimitedConsumer struct { 161 opts *roachpb.MySQLOutfileOptions 162 } 163 164 var _ importRowConsumer = &delimitedConsumer{} 165 166 // FillDatums implements importRowConsumer 167 func (d *delimitedConsumer) FillDatums( 168 input interface{}, rowNum int64, conv *row.DatumRowConverter, 169 ) error { 170 data := input.([]rune) 171 172 // The current field being read needs to be a list to be able to undo 173 // field enclosures at end of field. 174 var fieldParts []rune 175 176 // If we have an escaping char defined, seeing it means the next char is to be 177 // treated as escaped -- usually that means literal but has some specific 178 // mappings defined as well. 179 var nextLiteral bool 180 181 // If we have an enclosing char defined, seeing it begins reading a field -- 182 // which means we do not look for separators until we see the end of the field 183 // as indicated by the matching enclosing char. 184 var readingField bool 185 186 // If we have just encountered a potential encloser symbol. 187 // That means if an end of field or line is next we should honor it. 188 var gotEncloser bool 189 190 var gotNull bool 191 192 var datumIdx int 193 194 addField := func() error { 195 defer func() { 196 fieldParts = fieldParts[:0] 197 readingField = false 198 gotEncloser = false 199 }() 200 if nextLiteral { 201 return newImportRowError(errors.New("unmatched literal"), string(data), rowNum) 202 } 203 204 var datum tree.Datum 205 206 // If previous symbol was field encloser it should be 207 // dropped as it only marks end of field. Otherwise 208 // throw an error since we don;t expect unmatched encloser. 209 if gotEncloser { 210 // If the encloser marked end of field 211 // drop it. 212 if readingField { 213 fieldParts = fieldParts[:len(fieldParts)-1] 214 } else { 215 // Unexpected since we did not see one at start of field. 216 gotEncloser = false 217 return newImportRowError(errors.New("unmatched field enclosure at end of field"), 218 string(data), rowNum) 219 } 220 } else if readingField { 221 return newImportRowError(errors.New("unmatched field enclosure at start of field"), 222 string(data), rowNum) 223 } 224 field := string(fieldParts) 225 if datumIdx >= len(conv.VisibleCols) { 226 return newImportRowError( 227 fmt.Errorf("too many columns, got %d expected %d", datumIdx+1, len(conv.VisibleCols)), 228 string(data), rowNum) 229 } 230 231 if gotNull { 232 gotNull = false 233 if len(field) != 0 { 234 return newImportRowError(fmt.Errorf("unexpected data after null encoding: %q", field), 235 string(data), rowNum) 236 } 237 datum = tree.DNull 238 } else if (!d.opts.HasEscape && field == "NULL") || d.opts.NullEncoding != nil && field == *d.opts.NullEncoding { 239 datum = tree.DNull 240 } else { 241 // This uses ParseDatumStringAsWithRawBytes instead of ParseDatumStringAs since mysql emits 242 // raw byte strings that do not use the same escaping as our ParseBytes 243 // function expects, and the difference between ParseStringAs and 244 // ParseDatumStringAs is whether or not it attempts to parse bytes. 245 var err error 246 datum, err = sqlbase.ParseDatumStringAsWithRawBytes(conv.VisibleColTypes[datumIdx], field, conv.EvalCtx) 247 if err != nil { 248 col := conv.VisibleCols[datumIdx] 249 return newImportRowError( 250 fmt.Errorf("error %s while parse %q as %s", err, col.Name, col.Type.SQLString()), 251 string(data), rowNum) 252 } 253 } 254 conv.Datums[datumIdx] = datum 255 datumIdx++ 256 return nil 257 } 258 259 // Main parsing loop body, returns true to indicate unrecoverable error. 260 // We are being conservative and treating most errors as unrecoverable for now. 261 for _, c := range data { 262 // Do we need to check for escaping? 263 if d.opts.HasEscape { 264 if nextLiteral { 265 nextLiteral = false 266 // See https://dev.mysql.com/doc/refman/8.0/en/load-data.html. 267 switch c { 268 case '0': 269 fieldParts = append(fieldParts, rune(0)) 270 case 'b': 271 fieldParts = append(fieldParts, rune('\b')) 272 case 'n': 273 fieldParts = append(fieldParts, rune('\n')) 274 case 'r': 275 fieldParts = append(fieldParts, rune('\r')) 276 case 't': 277 fieldParts = append(fieldParts, rune('\t')) 278 case 'Z': 279 fieldParts = append(fieldParts, rune(byte(26))) 280 case 'N': 281 if gotNull { 282 return newImportRowError(errors.New("unexpected null encoding"), string(data), rowNum) 283 } 284 gotNull = true 285 default: 286 fieldParts = append(fieldParts, c) 287 } 288 gotEncloser = false 289 continue 290 } 291 292 if c == d.opts.Escape { 293 nextLiteral = true 294 gotEncloser = false 295 continue 296 } 297 } 298 299 // Are we done with the field, or even the whole row? 300 if (!readingField || gotEncloser) && c == d.opts.FieldSeparator { 301 if err := addField(); err != nil { 302 return err 303 } 304 continue 305 } 306 307 if gotEncloser { 308 gotEncloser = false 309 } 310 311 // If enclosing is not disabled, check for the encloser. 312 // Technically when it is not optional, we could _require_ it to start and 313 // end fields, but for the purposes of decoding, we don't actually care -- 314 // we'll handle it if we see it either way. 315 if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never && c == d.opts.Encloser { 316 if !readingField && len(fieldParts) == 0 { 317 readingField = true 318 continue 319 } 320 gotEncloser = true 321 } 322 fieldParts = append(fieldParts, c) 323 } 324 325 if err := addField(); err != nil { 326 return err 327 } 328 329 if datumIdx != len(conv.VisibleCols) { 330 return newImportRowError(fmt.Errorf( 331 "unexpected number of columns, expected %d got %d", len(conv.VisibleCols), datumIdx), 332 string(data), rowNum) 333 } 334 335 return nil 336 } 337 338 func (d *mysqloutfileReader) readFile( 339 ctx context.Context, input *fileReader, inputIdx int32, resumePos int64, rejected chan string, 340 ) error { 341 producer := &delimitedProducer{ 342 importCtx: d.importCtx, 343 opts: &d.opts, 344 input: input, 345 reader: bufio.NewReaderSize(input, 64*1024), 346 } 347 consumer := &delimitedConsumer{opts: &d.opts} 348 349 if resumePos < int64(d.opts.Skip) { 350 resumePos = int64(d.opts.Skip) 351 } 352 353 fileCtx := &importFileContext{ 354 source: inputIdx, 355 skip: resumePos, 356 rejected: rejected, 357 } 358 359 return runParallelImport(ctx, d.importCtx, fileCtx, producer, consumer) 360 }