github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/importccl/read_import_workload.go

// Copyright 2019 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package importccl

import (
	"context"
	"net/url"
	"runtime"
	"strings"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/apd"
	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/row"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil/pgdate"
	"github.com/cockroachdb/cockroach/pkg/workload"
	"github.com/cockroachdb/errors"
)

type workloadReader struct {
	evalCtx *tree.EvalContext
	table   *sqlbase.TableDescriptor
	kvCh    chan row.KVBatch
}

var _ inputConverter = &workloadReader{}

func newWorkloadReader(
	kvCh chan row.KVBatch, table *sqlbase.TableDescriptor, evalCtx *tree.EvalContext,
) *workloadReader {
	return &workloadReader{evalCtx: evalCtx, table: table, kvCh: kvCh}
}

// start is a no-op for the workload reader; all of the work happens in
// readFiles.
func (w *workloadReader) start(ctx ctxgroup.Group) {
}
// makeDatumFromColOffset tries to fast-path a few workload-generated types
// directly into datums, to avoid making a string and then parsing it.
func makeDatumFromColOffset(
	alloc *sqlbase.DatumAlloc, hint *types.T, evalCtx *tree.EvalContext, col coldata.Vec, rowIdx int,
) (tree.Datum, error) {
	if col.Nulls().NullAt(rowIdx) {
		return tree.DNull, nil
	}
	switch t := col.Type(); col.CanonicalTypeFamily() {
	case types.BoolFamily:
		return tree.MakeDBool(tree.DBool(col.Bool()[rowIdx])), nil
	case types.IntFamily:
		switch t.Width() {
		case 0, 64:
			switch hint.Family() {
			case types.IntFamily:
				return alloc.NewDInt(tree.DInt(col.Int64()[rowIdx])), nil
			case types.DecimalFamily:
				d := *apd.New(col.Int64()[rowIdx], 0)
				return alloc.NewDDecimal(tree.DDecimal{Decimal: d}), nil
			case types.DateFamily:
				date, err := pgdate.MakeDateFromUnixEpoch(col.Int64()[rowIdx])
				if err != nil {
					return nil, err
				}
				return alloc.NewDDate(tree.DDate{Date: date}), nil
			}
		case 16:
			switch hint.Family() {
			case types.IntFamily:
				return alloc.NewDInt(tree.DInt(col.Int16()[rowIdx])), nil
			}
		}
	case types.FloatFamily:
		switch hint.Family() {
		case types.FloatFamily:
			return alloc.NewDFloat(tree.DFloat(col.Float64()[rowIdx])), nil
		case types.DecimalFamily:
			var d apd.Decimal
			if _, err := d.SetFloat64(col.Float64()[rowIdx]); err != nil {
				return nil, err
			}
			return alloc.NewDDecimal(tree.DDecimal{Decimal: d}), nil
		}
	case types.BytesFamily:
		switch hint.Family() {
		case types.BytesFamily:
			return alloc.NewDBytes(tree.DBytes(col.Bytes().Get(rowIdx))), nil
		case types.StringFamily:
			data := col.Bytes().Get(rowIdx)
			// Reinterpret the []byte as a string without copying.
			str := *(*string)(unsafe.Pointer(&data))
			return alloc.NewDString(tree.DString(str)), nil
		default:
			// Fall back to parsing the bytes as a string of the hinted type.
			data := col.Bytes().Get(rowIdx)
			str := *(*string)(unsafe.Pointer(&data))
			return sqlbase.ParseDatumStringAs(hint, str, evalCtx)
		}
	}
	return nil, errors.Errorf(
		`don't know how to interpret %s column as %s`, col.Type(), hint)
}
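// The zero-copy []byte-to-string conversions above rely on the resulting
// datum being encoded into KVs before the column vector's backing memory is
// reused by the next FillBatch call. A minimal sketch of the same trick in
// isolation (unsafeString is a hypothetical helper, not part of this
// package's API):
//
//	func unsafeString(b []byte) string {
//		// The string header aliases b's memory; b must not be mutated
//		// or recycled while the string is live.
//		return *(*string)(unsafe.Pointer(&b))
//	}
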
func (w *workloadReader) readFiles(
	ctx context.Context,
	dataFiles map[int32]string,
	_ map[int32]int64,
	_ roachpb.IOFileFormat,
	_ cloud.ExternalStorageFactory,
) error {
	wcs := make([]*WorkloadKVConverter, 0, len(dataFiles))
	for fileID, fileName := range dataFiles {
		file, err := url.Parse(fileName)
		if err != nil {
			return err
		}
		conf, err := cloud.ParseWorkloadConfig(file)
		if err != nil {
			return err
		}
		meta, err := workload.Get(conf.Generator)
		if err != nil {
			return err
		}
		// Different versions of the workload could generate different data, so
		// disallow this.
		if meta.Version != conf.Version {
			return errors.Errorf(
				`expected %s version "%s" but got "%s"`, meta.Name, conf.Version, meta.Version)
		}
		gen := meta.New()
		if f, ok := gen.(workload.Flagser); ok {
			flags := f.Flags()
			if err := flags.Parse(conf.Flags); err != nil {
				return errors.Wrapf(err, `parsing parameters %s`, strings.Join(conf.Flags, ` `))
			}
		}
		var t workload.Table
		for _, tbl := range gen.Tables() {
			if tbl.Name == conf.Table {
				t = tbl
				break
			}
		}
		if t.Name == `` {
			return errors.Errorf(`unknown table %s for generator %s`, conf.Table, meta.Name)
		}

		wc := NewWorkloadKVConverter(
			fileID, w.table, t.InitialRows, int(conf.BatchBegin), int(conf.BatchEnd), w.kvCh)
		wcs = append(wcs, wc)
	}

	for _, wc := range wcs {
		if err := ctxgroup.GroupWorkers(ctx, runtime.NumCPU(), func(ctx context.Context, _ int) error {
			evalCtx := w.evalCtx.Copy()
			return wc.Worker(ctx, evalCtx)
		}); err != nil {
			return err
		}
	}
	return nil
}

// WorkloadKVConverter converts workload.BatchedTuples to []roachpb.KeyValue.
type WorkloadKVConverter struct {
	tableDesc      *sqlbase.TableDescriptor
	rows           workload.BatchedTuples
	batchIdxAtomic int64
	batchEnd       int
	kvCh           chan row.KVBatch

	// For progress reporting.
	fileID                int32
	totalBatches          float32
	finishedBatchesAtomic int64
}

// NewWorkloadKVConverter returns a WorkloadKVConverter for the given table and
// range of batches, emitting converted kvs to the given channel.
func NewWorkloadKVConverter(
	fileID int32,
	tableDesc *sqlbase.TableDescriptor,
	rows workload.BatchedTuples,
	batchStart, batchEnd int,
	kvCh chan row.KVBatch,
) *WorkloadKVConverter {
	return &WorkloadKVConverter{
		tableDesc:      tableDesc,
		rows:           rows,
		batchIdxAtomic: int64(batchStart) - 1,
		batchEnd:       batchEnd,
		kvCh:           kvCh,
		totalBatches:   float32(batchEnd - batchStart),
		fileID:         fileID,
	}
}
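// A minimal usage sketch (hypothetical, mirroring readFiles above): convert
// every batch of a generator table with one worker per CPU.
//
//	wc := NewWorkloadKVConverter(
//		0 /* fileID */, tableDesc, tbl.InitialRows, 0, tbl.InitialRows.NumBatches, kvCh)
//	err := ctxgroup.GroupWorkers(ctx, runtime.NumCPU(),
//		func(ctx context.Context, _ int) error {
//			return wc.Worker(ctx, evalCtx.Copy())
//		})
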
// Worker can be called concurrently to create multiple workers that process
// batches in order. This keeps the concurrently running workers on ~adjacent
// batches at any given moment (as opposed to handing large ranges of batches
// to each worker, e.g. 0-999 to worker 1, 1000-1999 to worker 2, etc). This
// property is relevant when ordered workload batches produce ordered PK data:
// since the workers feed into a shared kvCh, contiguous blocks of PK data will
// usually be buffered together and thus batched together in the SST builder,
// minimizing the amount of overlapping SSTs ingested.
//
// This worker needs its own EvalContext and DatumAlloc.
func (w *WorkloadKVConverter) Worker(ctx context.Context, evalCtx *tree.EvalContext) error {
	conv, err := row.NewDatumRowConverter(ctx, w.tableDesc, nil /* targetColNames */, evalCtx, w.kvCh)
	if err != nil {
		return err
	}
	conv.KvBatch.Source = w.fileID
	conv.FractionFn = func() float32 {
		return float32(atomic.LoadInt64(&w.finishedBatchesAtomic)) / w.totalBatches
	}
	var alloc sqlbase.DatumAlloc
	var a bufalloc.ByteAllocator
	cb := coldata.NewMemBatchWithSize(nil /* types */, 0 /* size */, coldata.StandardColumnFactory)

	for {
		batchIdx := int(atomic.AddInt64(&w.batchIdxAtomic, 1))
		if batchIdx >= w.batchEnd {
			break
		}
		a = a[:0]
		w.rows.FillBatch(batchIdx, cb, &a)
		for rowIdx, numRows := 0, cb.Length(); rowIdx < numRows; rowIdx++ {
			for colIdx, col := range cb.ColVecs() {
				// TODO(dan): This does a type switch once per-datum. Reduce this to
				// a one-time switch per column.
				converted, err := makeDatumFromColOffset(
					&alloc, conv.VisibleColTypes[colIdx], evalCtx, col, rowIdx)
				if err != nil {
					return err
				}
				conv.Datums[colIdx] = converted
			}
			// `conv.Row` uses these as arguments to GenerateUniqueID to generate
			// hidden primary keys, when necessary. We want them to be ascending per
			// batch (to reduce overlap in the resulting kvs) and non-conflicting
			// (because of primary key uniqueness). The ids that come out of
			// GenerateUniqueID are sorted by (fileIdx, timestamp) and unique as long
			// as the two inputs are a unique combo, so using the index of the batch
			// within the table and the index of the row within the batch should do
			// what we want.
			fileIdx, timestamp := int32(batchIdx), int64(rowIdx)
			if err := conv.Row(ctx, fileIdx, timestamp); err != nil {
				return err
			}
		}
		atomic.AddInt64(&w.finishedBatchesAtomic, 1)
	}
	return conv.SendBatch(ctx)
}
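// The loop above amounts to an atomic work queue: every worker claims the next
// unclaimed batch index until the range is exhausted. The same pattern in
// isolation (hypothetical names, for illustration only):
//
//	next := int64(batchStart) - 1
//	claim := func() (batchIdx int, ok bool) {
//		batchIdx = int(atomic.AddInt64(&next, 1))
//		return batchIdx, batchIdx < batchEnd
//	}
//
// Because all workers draw from one shared counter, the batches in flight at
// any moment are approximately adjacent, which keeps the emitted PK ranges
// roughly contiguous, as described in Worker's comment.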