github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/libraries/doltcore/mvdata/data_mover.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mvdata

import (
	"bufio"
	"bytes"
	"context"
	"errors"
	"fmt"
	"sync/atomic"

	"github.com/dolthub/dolt/go/cmd/dolt/cli"
	"github.com/dolthub/dolt/go/cmd/dolt/errhand"
	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
	"github.com/dolthub/dolt/go/libraries/doltcore/env"
	"github.com/dolthub/dolt/go/libraries/doltcore/env/actions"
	"github.com/dolthub/dolt/go/libraries/doltcore/row"
	"github.com/dolthub/dolt/go/libraries/doltcore/rowconv"
	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/sqlutil"
	"github.com/dolthub/dolt/go/libraries/doltcore/table"
	"github.com/dolthub/dolt/go/libraries/doltcore/table/pipeline"
	"github.com/dolthub/dolt/go/libraries/doltcore/table/untyped/csv"
	"github.com/dolthub/dolt/go/libraries/utils/filesys"
	"github.com/dolthub/dolt/go/libraries/utils/set"
	"github.com/dolthub/dolt/go/store/types"
)

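// CsvOptions are options specific to moving data to or from CSV files.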
type CsvOptions struct {
	Delim string
}

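// XlsxOptions are options specific to moving data to or from XLSX files.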
type XlsxOptions struct {
	SheetName string
}

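// JSONOptions are options specific to moving data to or from JSON files.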
type JSONOptions struct {
	TableName string
	SchFile   string
}

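// DataMoverOptions describes the source and destination of a data move
// operation.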
type DataMoverOptions interface {
	WritesToTable() bool
	SrcName() string
	DestName() string
}

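// DataMoverCloser is a table.TableWriteCloser that can also flush its writes
// to a new root value.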
type DataMoverCloser interface {
	table.TableWriteCloser
	Flush(context.Context) (*doltdb.RootValue, error)
}

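// DataMover is a pipeline that reads rows from Rd, applies Transforms, and
// writes the results to Wr. If ContOnErr is set, rows that fail to transform
// are skipped and counted rather than aborting the move.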
type DataMover struct {
	Rd         table.TableReadCloser
	Transforms *pipeline.TransformCollection
	Wr         table.TableWriteCloser
	ContOnErr  bool
}

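// DataMoverCreationErrType classifies the errors that can occur while
// constructing a DataMover.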
type DataMoverCreationErrType string

const (
	CreateReaderErr   DataMoverCreationErrType = "Create reader error"
	NomsKindSchemaErr DataMoverCreationErrType = "Invalid schema error"
	SchemaErr         DataMoverCreationErrType = "Schema error"
	MappingErr        DataMoverCreationErrType = "Mapping error"
	ReplacingErr      DataMoverCreationErrType = "Replacing error"
	CreateMapperErr   DataMoverCreationErrType = "Mapper creation error"
	CreateWriterErr   DataMoverCreationErrType = "Create writer error"
	CreateSorterErr   DataMoverCreationErrType = "Create sorter error"
)

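// ErrProvidedPkNotFound is returned when a requested primary key column is
// not present in the inferred schema.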
var ErrProvidedPkNotFound = errors.New("provided primary key not found")

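// DataMoverCreationError pairs an error that occurred while creating a
// DataMover with its classification.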
type DataMoverCreationError struct {
	ErrType DataMoverCreationErrType
	Cause   error
}

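// String returns the error type and the underlying cause as a single message.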
func (dmce *DataMoverCreationError) String() string {
	return string(dmce.ErrType) + ": " + dmce.Cause.Error()
}

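// GCTableWriteCloser is a table.TableWriteCloser that can garbage collect the
// underlying store after a move completes.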
type GCTableWriteCloser interface {
	table.TableWriteCloser
	GC(ctx context.Context) error
}

// Move executes the pipeline that moves data from the pipeline's source
// DataLocation to its dest DataLocation. It returns the number of bad rows
// encountered during the import, and an error.
func (imp *DataMover) Move(ctx context.Context, sch schema.Schema) (badRowCount int64, err error) {
	defer imp.Rd.Close(ctx)
	defer func() {
		closeErr := imp.Wr.Close(ctx)
		if err == nil {
			err = closeErr
		}

		// If the move succeeded and the writer supports it, garbage collect
		// the underlying store before returning.
		if err == nil {
			if gcTWC, ok := imp.Wr.(GCTableWriteCloser); ok {
				err = gcTWC.GC(ctx)
			}
		}
	}()

	var badCount int64
	var rowErr error
	var printStarted bool
	var b bytes.Buffer
	badRowCB := func(trf *pipeline.TransformRowFailure) (quit bool) {
		if !imp.ContOnErr {
			rowErr = trf
			return true
		}

		if !printStarted {
			cli.PrintErrln("The following rows were skipped:")
			printStarted = true
		}

		r := pipeline.GetTransFailureRow(trf)

		if r != nil {
			err = writeBadRowToCli(ctx, r, sch, &b)
			if err != nil {
				return true
			}
		}

		atomic.AddInt64(&badCount, 1)
		return false
	}

	p := pipeline.NewAsyncPipeline(
		pipeline.ProcFuncForReader(ctx, imp.Rd),
		pipeline.ProcFuncForWriter(ctx, imp.Wr),
		imp.Transforms,
		badRowCB)
	p.Start()

	err = p.Wait()
	if err != nil {
		return 0, err
	}

	if rowErr != nil {
		return 0, rowErr
	}

	return badCount, nil
}

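// The following sketch (illustrative only, not part of this package's API)
// shows how a caller might wire up a DataMover; `ctx`, `rd`, and `wr` are
// assumed to be an existing context.Context, table.TableReadCloser, and
// table.TableWriteCloser respectively:
//
//	mover := &DataMover{
//		Rd:         rd,
//		Transforms: pipeline.NewTransformCollection(),
//		Wr:         wr,
//		ContOnErr:  true, // skip and count bad rows instead of aborting
//	}
//	badRows, err := mover.Move(ctx, wr.GetSchema())
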
// writeBadRowToCli prints a bad row in CSV form to stderr.
func writeBadRowToCli(ctx context.Context, r row.Row, sch schema.Schema, b *bytes.Buffer) error {
	sqlRow, err := sqlutil.DoltRowToSqlRow(r, sch)
	if err != nil {
		return err
	}

	wr := bufio.NewWriter(b)

	colValStrs := make([]*string, len(sqlRow))

	for colNum, col := range sqlRow {
		if col != nil {
			str := sqlutil.SqlColToStr(ctx, col)
			colValStrs[colNum] = &str
		} else {
			colValStrs[colNum] = nil
		}
	}

	err = csv.WriteCSVRow(wr, colValStrs, ",", false)
	if err != nil {
		return err
	}

	err = wr.Flush()
	if err != nil {
		return err
	}

	str := b.String()
	cli.PrintErr(str)

	return nil
}

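// MoveDataToRoot executes the given DataMover against root, flushes the
// result to a new root value if the move writes to a table, and calls
// updateRoot when the root has changed. It returns the new root, the number
// of bad rows skipped, and any error encountered.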
func MoveDataToRoot(ctx context.Context, mover *DataMover, mvOpts DataMoverOptions, root *doltdb.RootValue, updateRoot func(c context.Context, r *doltdb.RootValue) error) (*doltdb.RootValue, int64, errhand.VerboseError) {
	var badCount int64
	var err error
	newRoot := &doltdb.RootValue{}

	badCount, err = mover.Move(ctx, mover.Wr.GetSchema())

	if err != nil {
		if pipeline.IsTransformFailure(err) {
			bdr := errhand.BuildDError("\nA bad row was encountered while moving data.")

			r := pipeline.GetTransFailureRow(err)
			if r != nil {
				bdr.AddDetails("Bad Row: " + row.Fmt(ctx, r, mover.Wr.GetSchema()))
			}

			details := pipeline.GetTransFailureDetails(err)

			bdr.AddDetails(details)
			bdr.AddDetails("These can be ignored using the '--continue' flag.")

			return nil, badCount, bdr.Build()
		}
		return nil, badCount, errhand.BuildDError("An error occurred moving data:\n").AddCause(err).Build()
	}

	if mvOpts.WritesToTable() {
		wr := mover.Wr.(DataMoverCloser)
		newRoot, err = wr.Flush(ctx)
		if err != nil {
			return nil, badCount, errhand.BuildDError("Failed to apply changes to the table.").AddCause(err).Build()
		}

		rootHash, err := root.HashOf()
		if err != nil {
			return nil, badCount, errhand.BuildDError("Failed to hash the working value.").AddCause(err).Build()
		}

		newRootHash, err := newRoot.HashOf()
		if err != nil {
			return nil, badCount, errhand.BuildDError("Failed to hash the new root value.").AddCause(err).Build()
		}

		if rootHash != newRootHash {
			err = updateRoot(ctx, newRoot)
			if err != nil {
				return nil, badCount, errhand.BuildDError("Failed to update the working value.").AddCause(err).Build()
			}
		}
	}

	return newRoot, badCount, nil
}

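// MoveData runs the given DataMover against the working root of dEnv and
// updates the working root with the result. It returns the number of bad
// rows skipped and any error encountered.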
func MoveData(ctx context.Context, dEnv *env.DoltEnv, mover *DataMover, mvOpts DataMoverOptions) (int64, errhand.VerboseError) {
	root, err := dEnv.WorkingRoot(ctx)
	if err != nil {
		return 0, errhand.BuildDError("Failed to fetch the working value.").AddCause(err).Build()
	}
	_, badCount, moveErr := MoveDataToRoot(ctx, mover, mvOpts, root, dEnv.UpdateWorkingRoot)
	if moveErr != nil {
		return badCount, moveErr
	}
	return badCount, nil
}

// NameMapTransform creates a pipeline transform that converts rows from inSch to outSch based on a name mapping.
func NameMapTransform(ctx context.Context, vrw types.ValueReadWriter, inSch schema.Schema, outSch schema.Schema, mapper rowconv.NameMapper) (*pipeline.TransformCollection, error) {
	mapping, err := rowconv.NameMapping(inSch, outSch, mapper)

	if err != nil {
		return nil, err
	}

	rconv, err := rowconv.NewImportRowConverter(ctx, vrw, mapping)

	if err != nil {
		return nil, err
	}

	transforms := pipeline.NewTransformCollection()
	if !rconv.IdentityConverter {
		nt := pipeline.NewNamedTransform("Mapping transform", pipeline.GetRowConvTransformFunc(rconv))
		transforms.AppendTransforms(nt)
	}

	return transforms, nil
}

// SchAndTableNameFromFile reads a SQL schema file and creates a Dolt schema from it.
func SchAndTableNameFromFile(ctx context.Context, path string, fs filesys.ReadableFS, root *doltdb.RootValue) (string, schema.Schema, error) {
	if path == "" {
		return "", nil, errors.New("no schema file to parse")
	}

	data, err := fs.ReadFile(path)
	if err != nil {
		return "", nil, err
	}

	tn, sch, err := sqlutil.ParseCreateTableStatement(ctx, root, string(data))
	if err != nil {
		return "", nil, fmt.Errorf("%s in schema file %s", err.Error(), path)
	}

	return tn, sch, nil
}

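// InferSchema infers a schema from the rows supplied by rd, marking the
// columns named in pks as the primary key (adding NOT NULL constraints where
// missing) and generating tags for the new columns relative to root.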
func InferSchema(ctx context.Context, root *doltdb.RootValue, rd table.TableReadCloser, tableName string, pks []string, args actions.InferenceArgs) (schema.Schema, error) {
	infCols, err := actions.InferColumnTypesFromTableReader(ctx, root, rd, args)
	if err != nil {
		return nil, err
	}

	pkSet := set.NewStrSet(pks)
	newCols := schema.MapColCollection(infCols, func(col schema.Column) schema.Column {
		col.IsPartOfPK = pkSet.Contains(col.Name)
		if col.IsPartOfPK {
			hasNotNull := false
			for _, constraint := range col.Constraints {
				if _, ok := constraint.(schema.NotNullConstraint); ok {
					hasNotNull = true
					break
				}
			}
			if !hasNotNull {
				col.Constraints = append(col.Constraints, schema.NotNullConstraint{})
			}
		}
		return col
	})

	// check that all provided primary keys are being used
	for _, pk := range pks {
		col, ok := newCols.GetByName(pk)
		if !ok || !col.IsPartOfPK {
			return nil, ErrProvidedPkNotFound
		}
	}

	newCols, err = root.GenerateTagsForNewColColl(ctx, tableName, newCols)
	if err != nil {
		return nil, errhand.BuildDError("failed to generate new schema").AddCause(err).Build()
	}

	err = schema.ValidateForInsert(newCols)
	if err != nil {
		return nil, errhand.BuildDError("invalid schema").AddCause(err).Build()
	}

	return schema.SchemaFromCols(newCols)
}