github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/importccl/read_import_csv.go

// Copyright 2017 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package importccl

import (
	"context"
	"io"
	"strings"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/row"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/encoding/csv"
	"github.com/cockroachdb/errors"
)

// csvInputReader is an inputConverter that parses CSV data files and emits
// the resulting rows as KV batches.
type csvInputReader struct {
	importCtx *parallelImportContext
	opts      roachpb.CSVOptions
}

var _ inputConverter = &csvInputReader{}

func newCSVInputReader(
	kvCh chan row.KVBatch,
	opts roachpb.CSVOptions,
	walltime int64,
	parallelism int,
	tableDesc *sqlbase.TableDescriptor,
	targetCols tree.NameList,
	evalCtx *tree.EvalContext,
) *csvInputReader {
	return &csvInputReader{
		importCtx: &parallelImportContext{
			walltime:   walltime,
			numWorkers: parallelism,
			evalCtx:    evalCtx,
			tableDesc:  tableDesc,
			targetCols: targetCols,
			kvCh:       kvCh,
		},
		opts: opts,
	}
}

// start implements the inputConverter interface. The CSV reader needs no
// long-lived helper goroutines, so this is a no-op.
func (c *csvInputReader) start(group ctxgroup.Group) {
}

// readFiles implements the inputConverter interface.
func (c *csvInputReader) readFiles(
	ctx context.Context,
	dataFiles map[int32]string,
	resumePos map[int32]int64,
	format roachpb.IOFileFormat,
	makeExternalStorage cloud.ExternalStorageFactory,
) error {
	return readInputFiles(ctx, dataFiles, resumePos, format, c.readFile, makeExternalStorage)
}

func (c *csvInputReader) readFile(
	ctx context.Context, input *fileReader, inputIdx int32, resumePos int64, rejected chan string,
) error {
	producer, consumer := newCSVPipeline(c, input)

	// Skip rows the user asked to skip (opts.Skip) as well as rows already
	// ingested before a resume (resumePos), whichever is larger.
	if resumePos < int64(c.opts.Skip) {
		resumePos = int64(c.opts.Skip)
	}

	fileCtx := &importFileContext{
		source:   inputIdx,
		skip:     resumePos,
		rejected: rejected,
	}

	return runParallelImport(ctx, c.importCtx, fileCtx, producer, consumer)
}

// csvRowProducer implements importRowProducer by reading records, one at a
// time, from the underlying CSV reader.
type csvRowProducer struct {
	importCtx       *parallelImportContext
	opts            *roachpb.CSVOptions
	csv             *csv.Reader
	rowNum          int64
	err             error
	record          []string
	progress        func() float32
	expectedColumns tree.NameList
}

var _ importRowProducer = &csvRowProducer{}

// Scan() implements importRowProducer interface.
func (p *csvRowProducer) Scan() bool {
	p.record, p.err = p.csv.Read()

	if p.err == io.EOF {
		p.err = nil
		return false
	}

	return p.err == nil
}

// Err() implements importRowProducer interface.
func (p *csvRowProducer) Err() error {
	return p.err
}

// Skip() implements importRowProducer interface.
func (p *csvRowProducer) Skip() error {
	// No-op: Scan() has already consumed the record.
	return nil
}

// strRecord rejoins a parsed record into a single line, using the configured
// field separator, for error messages and the rejected-rows output.
func strRecord(record []string, sep rune) string {
	csvSep := ","
	if sep != 0 {
		csvSep = string(sep)
	}
	return strings.Join(record, csvSep)
}

// Row() implements importRowProducer interface.
func (p *csvRowProducer) Row() (interface{}, error) {
	p.rowNum++
	expectedColsLen := len(p.expectedColumns)
	if expectedColsLen == 0 {
		expectedColsLen = len(p.importCtx.tableDesc.VisibleColumns())
	}

	if len(p.record) == expectedColsLen {
		// Expected number of columns.
	} else if len(p.record) == expectedColsLen+1 && p.record[expectedColsLen] == "" {
		// Line has the optional trailing comma; ignore the empty field.
		p.record = p.record[:expectedColsLen]
	} else {
		return nil, newImportRowError(
			errors.Errorf("expected %d fields, got %d", expectedColsLen, len(p.record)),
			strRecord(p.record, p.opts.Comma),
			p.rowNum)
	}
	return p.record, nil
}

// Progress() implements importRowProducer interface.
func (p *csvRowProducer) Progress() float32 {
	return p.progress()
}

// csvRowConsumer implements importRowConsumer by converting the string
// fields produced by csvRowProducer into datums.
type csvRowConsumer struct {
	importCtx *parallelImportContext
	opts      *roachpb.CSVOptions
}

var _ importRowConsumer = &csvRowConsumer{}

// FillDatums() implements importRowConsumer interface.
func (c *csvRowConsumer) FillDatums(
	row interface{}, rowNum int64, conv *row.DatumRowConverter,
) error {
	record := row.([]string)
	datumIdx := 0

	for i, field := range record {
		// Skip over record entries corresponding to columns not in the target
		// columns specified by the user.
		if _, ok := conv.IsTargetCol[i]; !ok {
			continue
		}

		if c.opts.NullEncoding != nil &&
			field == *c.opts.NullEncoding {
			conv.Datums[datumIdx] = tree.DNull
		} else {
			var err error
			conv.Datums[datumIdx], err = sqlbase.ParseDatumStringAs(conv.VisibleColTypes[i], field, conv.EvalCtx)
			if err != nil {
				col := conv.VisibleCols[i]
				return newImportRowError(
					errors.Wrapf(err, "parse %q as %s", col.Name, col.Type.SQLString()),
					strRecord(record, c.opts.Comma),
					rowNum)
			}
		}
		datumIdx++
	}
	return nil
}

// newCSVPipeline constructs the producer/consumer pair that feeds
// runParallelImport for a single CSV data file.
func newCSVPipeline(c *csvInputReader, input *fileReader) (*csvRowProducer, *csvRowConsumer) {
	cr := csv.NewReader(input)
	if c.opts.Comma != 0 {
		cr.Comma = c.opts.Comma
	}
	// Allow a variable number of fields per record; Row() performs its own
	// column-count validation.
	cr.FieldsPerRecord = -1
	cr.LazyQuotes = !c.opts.StrictQuotes
	cr.Comment = c.opts.Comment

	producer := &csvRowProducer{
		importCtx:       c.importCtx,
		opts:            &c.opts,
		csv:             cr,
		progress:        func() float32 { return input.ReadFraction() },
		expectedColumns: c.importCtx.targetCols,
	}
	consumer := &csvRowConsumer{
		importCtx: c.importCtx,
		opts:      &c.opts,
	}

	return producer, consumer
}