github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/importccl/import_processor.go

// Copyright 2017 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package importccl

import (
	"context"
	"math"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/cockroach/pkg/ccl/storageccl"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/row"
	"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
)

var csvOutputTypes = []*types.T{
	types.Bytes,
	types.Bytes,
}

type readImportDataProcessor struct {
	flowCtx *execinfra.FlowCtx
	spec    execinfrapb.ReadImportDataSpec
	output  execinfra.RowReceiver
}

var _ execinfra.Processor = &readImportDataProcessor{}

func (cp *readImportDataProcessor) OutputTypes() []*types.T {
	return csvOutputTypes
}

func newReadImportDataProcessor(
	flowCtx *execinfra.FlowCtx,
	processorID int32,
	spec execinfrapb.ReadImportDataSpec,
	output execinfra.RowReceiver,
) (execinfra.Processor, error) {
	cp := &readImportDataProcessor{
		flowCtx: flowCtx,
		spec:    spec,
		output:  output,
	}
	return cp, nil
}

func (cp *readImportDataProcessor) Run(ctx context.Context) {
	ctx, span := tracing.ChildSpan(ctx, "readImportDataProcessor")
	defer tracing.FinishSpan(span)
	defer cp.output.ProducerDone()

	progCh := make(chan execinfrapb.RemoteProducerMetadata_BulkProcessorProgress)

	var summary *roachpb.BulkOpSummary
	var err error
	// We don't have to worry about this goroutine leaking: below we loop over
	// progCh, which is closed only after the goroutine returns.
	go func() {
		defer close(progCh)
		summary, err = runImport(ctx, cp.flowCtx, &cp.spec, progCh)
	}()

	for prog := range progCh {
		// Take a copy of the loop variable so that the address we push to the
		// output processor remains valid for this iteration's value.
		p := prog
		cp.output.Push(nil, &execinfrapb.ProducerMetadata{BulkProcessorProgress: &p})
	}

	if err != nil {
		cp.output.Push(nil, &execinfrapb.ProducerMetadata{Err: err})
		return
	}

	// Once the import is done, send back to the controller the serialized
	// summary of the import operation. For more info see roachpb.BulkOpSummary.
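	// The row pushed below matches csvOutputTypes above: column 0 carries the
	// marshaled BulkOpSummary and column 1 is sent as empty bytes, unused by
	// this processor.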
	countsBytes, err := protoutil.Marshal(summary)
	if err != nil {
		cp.output.Push(nil, &execinfrapb.ProducerMetadata{Err: err})
		return
	}
	cp.output.Push(sqlbase.EncDatumRow{
		sqlbase.DatumToEncDatum(types.Bytes, tree.NewDBytes(tree.DBytes(countsBytes))),
		sqlbase.DatumToEncDatum(types.Bytes, tree.NewDBytes(tree.DBytes([]byte{}))),
	}, nil)
}

func makeInputConverter(
	ctx context.Context,
	spec *execinfrapb.ReadImportDataSpec,
	evalCtx *tree.EvalContext,
	kvCh chan row.KVBatch,
) (inputConverter, error) {
	var singleTable *sqlbase.TableDescriptor
	var singleTableTargetCols tree.NameList
	if len(spec.Tables) == 1 {
		for _, table := range spec.Tables {
			singleTable = table.Desc
			singleTableTargetCols = make(tree.NameList, len(table.TargetCols))
			for i, colName := range table.TargetCols {
				singleTableTargetCols[i] = tree.Name(colName)
			}
		}
	}

	if format := spec.Format.Format; singleTable == nil && !isMultiTableFormat(format) {
		return nil, errors.Errorf("%s only supports reading a single, pre-specified table", format.String())
	}

	switch spec.Format.Format {
	case roachpb.IOFileFormat_CSV:
		isWorkload := true
		for _, file := range spec.Uri {
			if conf, err := cloud.ExternalStorageConfFromURI(file); err != nil || conf.Provider != roachpb.ExternalStorageProvider_Workload {
				isWorkload = false
				break
			}
		}
		if isWorkload {
			return newWorkloadReader(kvCh, singleTable, evalCtx), nil
		}
		return newCSVInputReader(
			kvCh, spec.Format.Csv, spec.WalltimeNanos, int(spec.ReaderParallelism),
			singleTable, singleTableTargetCols, evalCtx), nil
	case roachpb.IOFileFormat_MysqlOutfile:
		return newMysqloutfileReader(
			spec.Format.MysqlOut, kvCh, spec.WalltimeNanos, int(spec.ReaderParallelism), singleTable, evalCtx)
	case roachpb.IOFileFormat_Mysqldump:
		return newMysqldumpReader(ctx, kvCh, spec.Tables, evalCtx)
	case roachpb.IOFileFormat_PgCopy:
		return newPgCopyReader(ctx, kvCh, spec.Format.PgCopy, singleTable, evalCtx)
	case roachpb.IOFileFormat_PgDump:
		return newPgDumpReader(ctx, kvCh, spec.Format.PgDump, spec.Tables, evalCtx)
	case roachpb.IOFileFormat_Avro:
		return newAvroInputReader(
			kvCh, singleTable, spec.Format.Avro, spec.WalltimeNanos,
			int(spec.ReaderParallelism), evalCtx)
	default:
		return nil, errors.Errorf(
			"Requested IMPORT format (%d) not supported by this node", spec.Format.Format)
	}
}

// ingestKvs drains kvs from the channel until it closes, ingesting them using
// the BulkAdder. It handles the required buffering/sorting/etc.
func ingestKvs(
	ctx context.Context,
	flowCtx *execinfra.FlowCtx,
	spec *execinfrapb.ReadImportDataSpec,
	progCh chan execinfrapb.RemoteProducerMetadata_BulkProcessorProgress,
	kvCh <-chan row.KVBatch,
) (*roachpb.BulkOpSummary, error) {
	ctx, span := tracing.ChildSpan(ctx, "ingestKVs")
	defer tracing.FinishSpan(span)

	writeTS := hlc.Timestamp{WallTime: spec.WalltimeNanos}

	flushSize := func() int64 { return storageccl.MaxImportBatchSize(flowCtx.Cfg.Settings) }

	// We create two bulk adders so as to combat the excessive flushing of small
	// SSTs which was observed when using a single adder for both primary and
	// secondary index kvs. The number of secondary index kvs is small, so we
	// expect the indexAdder to flush much less frequently than the pkIndexAdder.
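	//
	// For instance, a table with no secondary indexes produces only
	// primary-index KVs, so the indexAdder below would stay empty and never
	// flush on its own; the pkIndexAdder's flush hook accounts for that case.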
	//
	// It is highly recommended that the cluster setting controlling the max size
	// of the pkIndexAdder buffer be set below that of the indexAdder buffer.
	// Otherwise, as a consequence of filling up faster, the pkIndexAdder buffer
	// will hog memory as it tries to grow more aggressively.
	minBufferSize, maxBufferSize, stepSize := storageccl.ImportBufferConfigSizes(flowCtx.Cfg.Settings, true /* isPKAdder */)
	pkIndexAdder, err := flowCtx.Cfg.BulkAdder(ctx, flowCtx.Cfg.DB, writeTS, kvserverbase.BulkAdderOptions{
		Name:              "pkAdder",
		DisallowShadowing: true,
		SkipDuplicates:    true,
		MinBufferSize:     minBufferSize,
		MaxBufferSize:     maxBufferSize,
		StepBufferSize:    stepSize,
		SSTSize:           flushSize,
	})
	if err != nil {
		return nil, err
	}
	defer pkIndexAdder.Close(ctx)

	minBufferSize, maxBufferSize, stepSize = storageccl.ImportBufferConfigSizes(flowCtx.Cfg.Settings, false /* isPKAdder */)
	indexAdder, err := flowCtx.Cfg.BulkAdder(ctx, flowCtx.Cfg.DB, writeTS, kvserverbase.BulkAdderOptions{
		Name:              "indexAdder",
		DisallowShadowing: true,
		SkipDuplicates:    true,
		MinBufferSize:     minBufferSize,
		MaxBufferSize:     maxBufferSize,
		StepBufferSize:    stepSize,
		SSTSize:           flushSize,
	})
	if err != nil {
		return nil, err
	}
	defer indexAdder.Close(ctx)

	// Set up progress tracking:
	//  - offsets maps source file IDs to offsets in the slices below.
	//  - writtenRow contains the LastRow of the batch most recently added to
	//    the buffer.
	//  - writtenFraction contains the fraction of the input finished as of the
	//    last batch.
	//  - pkFlushedRow contains `writtenRow` as of the last pk adder flush.
	//  - idxFlushedRow contains `writtenRow` as of the last index adder flush.
	// Values in pkFlushedRow, idxFlushedRow and writtenFraction are written via
	// `atomic` so the progress-reporting goroutine can read them.
	writtenRow := make([]int64, len(spec.Uri))
	writtenFraction := make([]uint32, len(spec.Uri))

	pkFlushedRow := make([]int64, len(spec.Uri))
	idxFlushedRow := make([]int64, len(spec.Uri))

	// When the PK adder flushes, everything written has been flushed, so we set
	// pkFlushedRow to writtenRow. Additionally, if the indexAdder is empty then
	// we can treat it as flushed as well (in case we're not adding anything to
	// it).
	pkIndexAdder.SetOnFlush(func() {
		for i, emitted := range writtenRow {
			atomic.StoreInt64(&pkFlushedRow[i], emitted)
		}
		if indexAdder.IsEmpty() {
			for i, emitted := range writtenRow {
				atomic.StoreInt64(&idxFlushedRow[i], emitted)
			}
		}
	})
	indexAdder.SetOnFlush(func() {
		for i, emitted := range writtenRow {
			atomic.StoreInt64(&idxFlushedRow[i], emitted)
		}
	})

	// offsets maps input file ID to a slot in our progress tracking slices.
	offsets := make(map[int32]int, len(spec.Uri))
	var offset int
	for i := range spec.Uri {
		offsets[i] = offset
		offset++
	}

	pushProgress := func() {
		var prog execinfrapb.RemoteProducerMetadata_BulkProcessorProgress
		prog.ResumePos = make(map[int32]int64)
		prog.CompletedFraction = make(map[int32]float32)
		for file, offset := range offsets {
			pk := atomic.LoadInt64(&pkFlushedRow[offset])
			idx := atomic.LoadInt64(&idxFlushedRow[offset])
			// On resume we'll be able to skip up to the last row for which both
			// the PK and index adders have flushed KVs.
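			// Taking the smaller of the two watermarks below is effectively
			// min(pk, idx): rows past it may have KVs buffered in one adder but
			// not yet durable in the other.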
			if idx > pk {
				prog.ResumePos[file] = pk
			} else {
				prog.ResumePos[file] = idx
			}
			prog.CompletedFraction[file] = math.Float32frombits(atomic.LoadUint32(&writtenFraction[offset]))
		}
		progCh <- prog
	}

	// stopProgress will be closed when there is no more progress to report.
	stopProgress := make(chan struct{})
	g := ctxgroup.WithContext(ctx)
	g.GoCtx(func(ctx context.Context) error {
		tick := time.NewTicker(time.Second * 10)
		defer tick.Stop()
		done := ctx.Done()
		for {
			select {
			case <-done:
				return ctx.Err()
			case <-stopProgress:
				return nil
			case <-tick.C:
				pushProgress()
			}
		}
	})

	g.GoCtx(func(ctx context.Context) error {
		defer close(stopProgress)

		// We insert splits at every index span of the table above. Since the
		// BulkAdder is split aware when constructing SSTs, there is no risk of
		// worst case overlap behavior in the resulting AddSSTable calls.
		//
		// NB: We are getting rid of the pre-buffering stage which constructed
		// separate buckets for each table's primary data, and flushed to the
		// BulkAdder when the bucket was full. This is because a tpcc 1k IMPORT
		// would OOM when maintaining this buffer. Two big wins we got from this
		// pre-buffering stage were:
		//
		// 1. We avoided worst case overlapping behavior in the AddSSTable calls
		// as a result of flushing keys with the same TableIDIndexID prefix
		// together.
		//
		// 2. Secondary index KVs, which were few and filled the bucket
		// infrequently, were flushed rarely, resulting in fewer L0 (and total)
		// files.
		//
		// While we continue to achieve the first property as a result of the
		// splits mentioned above, the KVs sent to the BulkAdder are no longer
		// grouped, which results in flushing a much larger number of small SSTs.
		// This increases the number of L0 (and total) files, but with a lower
		// memory usage.
		for kvBatch := range kvCh {
			for _, kv := range kvBatch.KVs {
				_, _, indexID, indexErr := keys.TODOSQLCodec.DecodeIndexPrefix(kv.Key)
				if indexErr != nil {
					return indexErr
				}

				// Decide which adder to send the KV to by extracting its
				// index id.
				//
				// TODO(adityamaru): There is a potential optimization of
				// plumbing the different putters and differentiating based on
				// their type. It might be more efficient than parsing every kv.
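				// The routing below relies on index ID 1 being the table's
				// primary index; any other index ID is treated as a secondary
				// index and sent to indexAdder.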
				if indexID == 1 {
					if err := pkIndexAdder.Add(ctx, kv.Key, kv.Value.RawBytes); err != nil {
						if errors.HasType(err, (*kvserverbase.DuplicateKeyError)(nil)) {
							return errors.Wrap(err, "duplicate key in primary index")
						}
						return err
					}
				} else {
					if err := indexAdder.Add(ctx, kv.Key, kv.Value.RawBytes); err != nil {
						if errors.HasType(err, (*kvserverbase.DuplicateKeyError)(nil)) {
							return errors.Wrap(err, "duplicate key in index")
						}
						return err
					}
				}
			}
			offset := offsets[kvBatch.Source]
			writtenRow[offset] = kvBatch.LastRow
			atomic.StoreUint32(&writtenFraction[offset], math.Float32bits(kvBatch.Progress))
			if flowCtx.Cfg.TestingKnobs.BulkAdderFlushesEveryBatch {
				_ = pkIndexAdder.Flush(ctx)
				_ = indexAdder.Flush(ctx)
				pushProgress()
			}
		}
		return nil
	})

	if err := g.Wait(); err != nil {
		return nil, err
	}

	if err := pkIndexAdder.Flush(ctx); err != nil {
		if errors.HasType(err, (*kvserverbase.DuplicateKeyError)(nil)) {
			return nil, errors.Wrap(err, "duplicate key in primary index")
		}
		return nil, err
	}

	if err := indexAdder.Flush(ctx); err != nil {
		if errors.HasType(err, (*kvserverbase.DuplicateKeyError)(nil)) {
			return nil, errors.Wrap(err, "duplicate key in index")
		}
		return nil, err
	}

	addedSummary := pkIndexAdder.GetSummary()
	addedSummary.Add(indexAdder.GetSummary())
	return &addedSummary, nil
}

func init() {
	rowexec.NewReadImportDataProcessor = newReadImportDataProcessor
}
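// Note: the init above wires this CCL-licensed constructor into the OSS
// rowexec package, which declares NewReadImportDataProcessor as an injectable
// hook so that the import processor can be provided without rowexec having to
// depend on CCL code.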