github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/importccl/read_import_workload.go

// Copyright 2019 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package importccl

import (
	"context"
	"net/url"
	"runtime"
	"strings"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/apd"
	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/row"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil/pgdate"
	"github.com/cockroachdb/cockroach/pkg/workload"
	"github.com/cockroachdb/errors"
)

type workloadReader struct {
	evalCtx *tree.EvalContext
	table   *sqlbase.TableDescriptor
	kvCh    chan row.KVBatch
}

var _ inputConverter = &workloadReader{}

func newWorkloadReader(
	kvCh chan row.KVBatch, table *sqlbase.TableDescriptor, evalCtx *tree.EvalContext,
) *workloadReader {
	return &workloadReader{evalCtx: evalCtx, table: table, kvCh: kvCh}
}

// start is a no-op: the workload reader does all of its work in readFiles.
func (w *workloadReader) start(ctx ctxgroup.Group) {
}

// makeDatumFromColOffset tries to fast-path a few workload-generated types
// directly into datums, to dodge rendering a string and then parsing it.
func makeDatumFromColOffset(
	alloc *sqlbase.DatumAlloc, hint *types.T, evalCtx *tree.EvalContext, col coldata.Vec, rowIdx int,
) (tree.Datum, error) {
	if col.Nulls().NullAt(rowIdx) {
		return tree.DNull, nil
	}
	switch t := col.Type(); col.CanonicalTypeFamily() {
	case types.BoolFamily:
		return tree.MakeDBool(tree.DBool(col.Bool()[rowIdx])), nil
	case types.IntFamily:
		switch t.Width() {
		case 0, 64:
			switch hint.Family() {
			case types.IntFamily:
				return alloc.NewDInt(tree.DInt(col.Int64()[rowIdx])), nil
			case types.DecimalFamily:
				d := *apd.New(col.Int64()[rowIdx], 0)
				return alloc.NewDDecimal(tree.DDecimal{Decimal: d}), nil
			case types.DateFamily:
				date, err := pgdate.MakeDateFromUnixEpoch(col.Int64()[rowIdx])
				if err != nil {
					return nil, err
				}
				return alloc.NewDDate(tree.DDate{Date: date}), nil
			}
		case 16:
			switch hint.Family() {
			case types.IntFamily:
				return alloc.NewDInt(tree.DInt(col.Int16()[rowIdx])), nil
			}
		}
	case types.FloatFamily:
		switch hint.Family() {
		case types.FloatFamily:
			return alloc.NewDFloat(tree.DFloat(col.Float64()[rowIdx])), nil
		case types.DecimalFamily:
			var d apd.Decimal
			if _, err := d.SetFloat64(col.Float64()[rowIdx]); err != nil {
				return nil, err
			}
			return alloc.NewDDecimal(tree.DDecimal{Decimal: d}), nil
		}
	case types.BytesFamily:
		switch hint.Family() {
		case types.BytesFamily:
			return alloc.NewDBytes(tree.DBytes(col.Bytes().Get(rowIdx))), nil
		case types.StringFamily:
			// Zero-copy conversion of the column's bytes to a string: the
			// resulting string aliases the batch's buffer, so it must not be
			// mutated. A copying alternative is sketched after this function.
			data := col.Bytes().Get(rowIdx)
			str := *(*string)(unsafe.Pointer(&data))
			return alloc.NewDString(tree.DString(str)), nil
		default:
			// Fall back to the general datum parser for any other hinted type,
			// again aliasing the bytes rather than copying them.
			data := col.Bytes().Get(rowIdx)
			str := *(*string)(unsafe.Pointer(&data))
			return sqlbase.ParseDatumStringAs(hint, str, evalCtx)
		}
	}
	return nil, errors.Errorf(
		`don't know how to interpret %s column as %s`, col.Type(), hint)
}
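
// makeStringDatumCopying is a hypothetical sketch, not used by the import
// code, shown only to contrast with the zero-copy conversion inside
// makeDatumFromColOffset: it copies the column's bytes into a fresh string
// before building or parsing the datum. The copy stays valid even if the
// batch's buffer is later reused, but it allocates once per value, which is
// exactly the cost the unsafe fast path avoids.
func makeStringDatumCopying(
	alloc *sqlbase.DatumAlloc, hint *types.T, evalCtx *tree.EvalContext, col coldata.Vec, rowIdx int,
) (tree.Datum, error) {
	if col.Nulls().NullAt(rowIdx) {
		return tree.DNull, nil
	}
	// string(...) copies the bytes out of the column's buffer.
	str := string(col.Bytes().Get(rowIdx))
	if hint.Family() == types.StringFamily {
		return alloc.NewDString(tree.DString(str)), nil
	}
	return sqlbase.ParseDatumStringAs(hint, str, evalCtx)
}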

func (w *workloadReader) readFiles(
	ctx context.Context,
	dataFiles map[int32]string,
	_ map[int32]int64,
	_ roachpb.IOFileFormat,
	_ cloud.ExternalStorageFactory,
) error {

	wcs := make([]*WorkloadKVConverter, 0, len(dataFiles))
	for fileID, fileName := range dataFiles {
		file, err := url.Parse(fileName)
		if err != nil {
			return err
		}
		conf, err := cloud.ParseWorkloadConfig(file)
		if err != nil {
			return err
		}
		meta, err := workload.Get(conf.Generator)
		if err != nil {
			return err
		}
		// Different versions of the same workload could generate different
		// data, so disallow a version mismatch between the stored config and
		// the registered generator.
		if meta.Version != conf.Version {
			return errors.Errorf(
				`expected %s version "%s" but got "%s"`, meta.Name, conf.Version, meta.Version)
		}
		gen := meta.New()
		if f, ok := gen.(workload.Flagser); ok {
			flags := f.Flags()
			if err := flags.Parse(conf.Flags); err != nil {
				return errors.Wrapf(err, `parsing parameters %s`, strings.Join(conf.Flags, ` `))
			}
		}
		var t workload.Table
		for _, tbl := range gen.Tables() {
			if tbl.Name == conf.Table {
				t = tbl
				break
			}
		}
		if t.Name == `` {
			// NB: `err` is necessarily nil at this point, so wrapping it would
			// silently return nil; construct a new error instead.
			return errors.Errorf(`unknown table %s for generator %s`, conf.Table, meta.Name)
		}

		wc := NewWorkloadKVConverter(
			fileID, w.table, t.InitialRows, int(conf.BatchBegin), int(conf.BatchEnd), w.kvCh)
		wcs = append(wcs, wc)
	}

	// Process the files one at a time, giving each converter its own pool of
	// NumCPU workers.
	for _, wc := range wcs {
		if err := ctxgroup.GroupWorkers(ctx, runtime.NumCPU(), func(ctx context.Context, _ int) error {
			evalCtx := w.evalCtx.Copy()
			return wc.Worker(ctx, evalCtx)
		}); err != nil {
			return err
		}
	}
	return nil
}
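
// loadWorkloadTable is a hypothetical helper, not called anywhere in this
// file, that restates the resolution steps performed in the loop above: look
// up a registered generator by name, apply any stored flags, and find the
// named table among the generator's tables.
func loadWorkloadTable(generator, table string, flags []string) (workload.Table, error) {
	meta, err := workload.Get(generator)
	if err != nil {
		return workload.Table{}, err
	}
	gen := meta.New()
	if f, ok := gen.(workload.Flagser); ok {
		if err := f.Flags().Parse(flags); err != nil {
			return workload.Table{}, errors.Wrapf(err, `parsing parameters %s`, strings.Join(flags, ` `))
		}
	}
	for _, tbl := range gen.Tables() {
		if tbl.Name == table {
			return tbl, nil
		}
	}
	return workload.Table{}, errors.Errorf(`unknown table %s for generator %s`, table, generator)
}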

// WorkloadKVConverter converts workload.BatchedTuples to []roachpb.KeyValues.
type WorkloadKVConverter struct {
	tableDesc      *sqlbase.TableDescriptor
	rows           workload.BatchedTuples
	batchIdxAtomic int64
	batchEnd       int
	kvCh           chan row.KVBatch

	// For progress reporting
	fileID                int32
	totalBatches          float32
	finishedBatchesAtomic int64
}

// NewWorkloadKVConverter returns a WorkloadKVConverter for the given table and
// range of batches, emitting converted kvs to the given channel.
func NewWorkloadKVConverter(
	fileID int32,
	tableDesc *sqlbase.TableDescriptor,
	rows workload.BatchedTuples,
	batchStart, batchEnd int,
	kvCh chan row.KVBatch,
) *WorkloadKVConverter {
	return &WorkloadKVConverter{
		tableDesc:      tableDesc,
		rows:           rows,
		batchIdxAtomic: int64(batchStart) - 1,
		batchEnd:       batchEnd,
		kvCh:           kvCh,
		totalBatches:   float32(batchEnd - batchStart),
		fileID:         fileID,
	}
}

// Worker can be called concurrently to create multiple workers that process
// batches in order. Because each worker claims its next batch by atomically
// incrementing a shared counter, concurrently running workers are on
// ~adjacent batches at any given moment (as opposed to handing large ranges
// of batches to each worker, e.g. 0-999 to worker 1, 1000-1999 to worker 2,
// etc). This property is relevant when ordered workload batches produce
// ordered PK data: since the workers feed into a shared kvCh, contiguous
// blocks of PK data will usually be buffered together and thus batched
// together in the SST builder, minimizing the amount of overlapping SSTs
// ingested. A standalone sketch of this batch-claiming scheme follows the
// function.
//
// This worker needs its own EvalContext and DatumAlloc.
func (w *WorkloadKVConverter) Worker(ctx context.Context, evalCtx *tree.EvalContext) error {
	conv, err := row.NewDatumRowConverter(ctx, w.tableDesc, nil /* targetColNames */, evalCtx, w.kvCh)
	if err != nil {
		return err
	}
	conv.KvBatch.Source = w.fileID
	conv.FractionFn = func() float32 {
		return float32(atomic.LoadInt64(&w.finishedBatchesAtomic)) / w.totalBatches
	}
	var alloc sqlbase.DatumAlloc
	var a bufalloc.ByteAllocator
	cb := coldata.NewMemBatchWithSize(nil /* types */, 0 /* size */, coldata.StandardColumnFactory)

	for {
		batchIdx := int(atomic.AddInt64(&w.batchIdxAtomic, 1))
		if batchIdx >= w.batchEnd {
			break
		}
		a = a[:0]
		w.rows.FillBatch(batchIdx, cb, &a)
		for rowIdx, numRows := 0, cb.Length(); rowIdx < numRows; rowIdx++ {
			for colIdx, col := range cb.ColVecs() {
				// TODO(dan): This does a type switch once per-datum. Reduce this to
				// a one-time switch per column.
				converted, err := makeDatumFromColOffset(
					&alloc, conv.VisibleColTypes[colIdx], evalCtx, col, rowIdx)
				if err != nil {
					return err
				}
				conv.Datums[colIdx] = converted
			}
			// `conv.Row` uses these as arguments to GenerateUniqueID to generate
			// hidden primary keys, when necessary. We want them to be ascending per
			// batch (to reduce overlap in the resulting kvs) and non-conflicting
			// (because of primary key uniqueness). The ids that come out of
			// GenerateUniqueID are sorted by (fileIdx, timestamp) and unique as long
			// as the two inputs are a unique combo, so using the index of the batch
			// within the table and the index of the row within the batch should do
			// what we want.
			fileIdx, timestamp := int32(batchIdx), int64(rowIdx)
			if err := conv.Row(ctx, fileIdx, timestamp); err != nil {
				return err
			}
		}
		atomic.AddInt64(&w.finishedBatchesAtomic, 1)
	}
	return conv.SendBatch(ctx)
}
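
// claimBatchesSketch is a hypothetical, unused illustration of the
// batch-claiming scheme described in the Worker comment above: numWorkers
// goroutines share a single atomic counter, each repeatedly claims the next
// batch index by incrementing it, and stops once the end of the range is
// reached. At any moment the in-flight batch indexes therefore form a small,
// roughly contiguous window rather than large disjoint ranges per worker.
// The process callback stands in for the per-batch conversion work.
func claimBatchesSketch(
	ctx context.Context, numWorkers, batchStart, batchEnd int, process func(batchIdx int) error,
) error {
	nextBatchAtomic := int64(batchStart) - 1
	return ctxgroup.GroupWorkers(ctx, numWorkers, func(ctx context.Context, _ int) error {
		for {
			batchIdx := int(atomic.AddInt64(&nextBatchAtomic, 1))
			if batchIdx >= batchEnd {
				return nil
			}
			if err := process(batchIdx); err != nil {
				return err
			}
		}
	})
}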