github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/diff/diff_stat.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package diff
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  	"time"
    23  
    24  	"github.com/dolthub/dolt/go/cmd/dolt/errhand"
    25  	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable"
    26  	"github.com/dolthub/dolt/go/libraries/doltcore/row"
    27  	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
    28  	"github.com/dolthub/dolt/go/store/diff"
    29  	"github.com/dolthub/dolt/go/store/prolly"
    30  	"github.com/dolthub/dolt/go/store/prolly/tree"
    31  	"github.com/dolthub/dolt/go/store/types"
    32  	"github.com/dolthub/dolt/go/store/val"
    33  )
    34  
// ErrPrimaryKeySetChanged is the sentinel error that StatForTableDelta wraps
// when the primary key sets of the two schemas are not diffable. Callers can
// detect it with errors.Is.
var ErrPrimaryKeySetChanged = errors.New("primary key set changed")
    36  
    37  type DiffStatProgress struct {
    38  	Adds, Removes, Changes, CellChanges, NewRowSize, OldRowSize, NewCellSize, OldCellSize uint64
    39  }
    40  
// prollyReporter is the callback used by diffProllyTrees: it receives one
// prolly-tree diff together with the value-column mapping and the value
// descriptors of both sides, and reports the resulting progress on |ch|.
type prollyReporter func(ctx context.Context, vMapping val.OrdinalMapping, fromD, toD val.TupleDesc, change tree.Diff, ch chan<- DiffStatProgress) error

// nomsReporter is the callback used by statWithReporter: it receives one noms
// map difference together with both schemas, and reports the resulting
// progress on |ch|.
type nomsReporter func(ctx context.Context, change *diff.Difference, fromSch, toSch schema.Schema, ch chan<- DiffStatProgress) error
    43  
    44  // Stat reports a stat of diff changes between two values
    45  // todo: make package private once dolthub is migrated
    46  func Stat(ctx context.Context, ch chan DiffStatProgress, from, to durable.Index, fromSch, toSch schema.Schema) (err error) {
    47  	fc, err := from.Count()
    48  	if err != nil {
    49  		return err
    50  	}
    51  	tc, err := to.Count()
    52  	if err != nil {
    53  		return err
    54  	}
    55  	ch <- DiffStatProgress{OldRowSize: fc, NewRowSize: tc}
    56  
    57  	fk, tk := schema.IsKeyless(fromSch), schema.IsKeyless(toSch)
    58  	var keyless bool
    59  	if fk && tk {
    60  		keyless = true
    61  	} else if fk != tk {
    62  		return fmt.Errorf("cannot perform a diff between keyless and keyed schema")
    63  	}
    64  
    65  	if types.IsFormat_DOLT(from.Format()) {
    66  		return diffProllyTrees(ctx, ch, keyless, from, to, fromSch, toSch)
    67  	}
    68  
    69  	return diffNomsMaps(ctx, ch, keyless, from, to, fromSch, toSch)
    70  }
    71  
    72  // StatForTableDelta pushes diff stat progress messages for the table delta given to the channel given
    73  func StatForTableDelta(ctx context.Context, ch chan DiffStatProgress, td TableDelta) error {
    74  	fromSch, toSch, err := td.GetSchemas(ctx)
    75  	if err != nil {
    76  		return errhand.BuildDError("cannot retrieve schema for table %s", td.ToName).AddCause(err).Build()
    77  	}
    78  
    79  	if !schema.ArePrimaryKeySetsDiffable(td.Format(), fromSch, toSch) {
    80  		return fmt.Errorf("failed to compute diff stat for table %s: %w", td.CurName(), ErrPrimaryKeySetChanged)
    81  	}
    82  
    83  	keyless, err := td.IsKeyless(ctx)
    84  	if err != nil {
    85  		return err
    86  	}
    87  
    88  	fromRows, toRows, err := td.GetRowData(ctx)
    89  	if err != nil {
    90  		return err
    91  	}
    92  
    93  	if types.IsFormat_DOLT(td.Format()) {
    94  		return diffProllyTrees(ctx, ch, keyless, fromRows, toRows, fromSch, toSch)
    95  	} else {
    96  		return diffNomsMaps(ctx, ch, keyless, fromRows, toRows, fromSch, toSch)
    97  	}
    98  }
    99  
   100  func diffProllyTrees(ctx context.Context, ch chan DiffStatProgress, keyless bool, from, to durable.Index, fromSch, toSch schema.Schema) error {
   101  	_, vMapping, err := schema.MapSchemaBasedOnTagAndName(fromSch, toSch)
   102  	if err != nil {
   103  		return err
   104  	}
   105  
   106  	var f, t prolly.Map
   107  	if from != nil {
   108  		f = durable.ProllyMapFromIndex(from)
   109  	}
   110  	if to != nil {
   111  		t = durable.ProllyMapFromIndex(to)
   112  
   113  	}
   114  
   115  	_, fVD := f.Descriptors()
   116  	_, tVD := t.Descriptors()
   117  
   118  	var rpr prollyReporter
   119  	if keyless {
   120  		rpr = reportKeylessChanges
   121  	} else {
   122  		var fc uint64
   123  		if from != nil {
   124  			fc, err = from.Count()
   125  			if err != nil {
   126  				return err
   127  			}
   128  		}
   129  
   130  		cfc := uint64(len(fromSch.GetAllCols().GetColumns())) * fc
   131  		var tc uint64
   132  		if to != nil {
   133  			tc, err = to.Count()
   134  			if err != nil {
   135  				return err
   136  			}
   137  		}
   138  
   139  		ctc := uint64(len(toSch.GetAllCols().GetColumns())) * tc
   140  		rpr = reportPkChanges
   141  		ch <- DiffStatProgress{
   142  			OldRowSize:  fc,
   143  			NewRowSize:  tc,
   144  			OldCellSize: cfc,
   145  			NewCellSize: ctc,
   146  		}
   147  	}
   148  
   149  	// TODO: Use `vMapping` to determine whether columns have been added or removed. If so, then all rows should
   150  	// count as modifications in the diff.
   151  	considerAllRowsModified := false
   152  	err = prolly.DiffMaps(ctx, f, t, considerAllRowsModified, func(ctx context.Context, diff tree.Diff) error {
   153  		return rpr(ctx, vMapping, fVD, tVD, diff, ch)
   154  	})
   155  	if err != nil && err != io.EOF {
   156  		return err
   157  	}
   158  	return nil
   159  }
   160  
   161  func diffNomsMaps(ctx context.Context, ch chan DiffStatProgress, keyless bool, fromRows durable.Index, toRows durable.Index, fromSch, toSch schema.Schema) error {
   162  	var rpr nomsReporter
   163  	if keyless {
   164  		rpr = reportNomsKeylessChanges
   165  	} else {
   166  		fc, err := fromRows.Count()
   167  		if err != nil {
   168  			return err
   169  		}
   170  		cfc := uint64(len(fromSch.GetAllCols().GetColumns())) * fc
   171  		tc, err := toRows.Count()
   172  		if err != nil {
   173  			return err
   174  		}
   175  		ctc := uint64(len(toSch.GetAllCols().GetColumns())) * tc
   176  		rpr = reportNomsPkChanges
   177  		ch <- DiffStatProgress{
   178  			OldRowSize:  fc,
   179  			NewRowSize:  tc,
   180  			OldCellSize: cfc,
   181  			NewCellSize: ctc,
   182  		}
   183  	}
   184  
   185  	return statWithReporter(ctx, ch, durable.NomsMapFromIndex(fromRows), durable.NomsMapFromIndex(toRows), rpr, fromSch, toSch)
   186  }
   187  
   188  func statWithReporter(ctx context.Context, ch chan DiffStatProgress, from, to types.Map, rpr nomsReporter, fromSch, toSch schema.Schema) (err error) {
   189  	ad := NewAsyncDiffer(1024)
   190  	ad.Start(ctx, from, to)
   191  	defer func() {
   192  		if cerr := ad.Close(); cerr != nil && err == nil {
   193  			err = cerr
   194  		}
   195  	}()
   196  
   197  	var more bool
   198  	var diffs []*diff.Difference
   199  	for {
   200  		diffs, more, err = ad.GetDiffs(100, time.Millisecond)
   201  		if err != nil {
   202  			return err
   203  		}
   204  
   205  		for _, df := range diffs {
   206  			err = rpr(ctx, df, fromSch, toSch, ch)
   207  			if err != nil {
   208  				return err
   209  			}
   210  		}
   211  
   212  		if !more {
   213  			break
   214  		}
   215  	}
   216  
   217  	return nil
   218  }
   219  
   220  func reportPkChanges(ctx context.Context, vMapping val.OrdinalMapping, fromD, toD val.TupleDesc, change tree.Diff, ch chan<- DiffStatProgress) error {
   221  	var stat DiffStatProgress
   222  	switch change.Type {
   223  	case tree.AddedDiff:
   224  		stat.Adds++
   225  	case tree.RemovedDiff:
   226  		stat.Removes++
   227  	case tree.ModifiedDiff:
   228  		stat.CellChanges = prollyCountCellDiff(vMapping, fromD, toD, val.Tuple(change.From), val.Tuple(change.To))
   229  		stat.Changes++
   230  	default:
   231  		return errors.New("unknown change type")
   232  	}
   233  	select {
   234  	case ch <- stat:
   235  		return nil
   236  	case <-ctx.Done():
   237  		return ctx.Err()
   238  	}
   239  }
   240  
   241  func reportKeylessChanges(ctx context.Context, vMapping val.OrdinalMapping, fromD, toD val.TupleDesc, change tree.Diff, ch chan<- DiffStatProgress) error {
   242  	var stat DiffStatProgress
   243  	var n, n2 uint64
   244  	switch change.Type {
   245  	case tree.AddedDiff:
   246  		n, _ = toD.GetUint64(0, val.Tuple(change.To))
   247  		stat.Adds += n
   248  	case tree.RemovedDiff:
   249  		n, _ = fromD.GetUint64(0, val.Tuple(change.From))
   250  		stat.Removes += n
   251  	case tree.ModifiedDiff:
   252  		n, _ = fromD.GetUint64(0, val.Tuple(change.From))
   253  		n2, _ = toD.GetUint64(0, val.Tuple(change.To))
   254  		if n < n2 {
   255  			stat.Adds += n2 - n
   256  		} else {
   257  			stat.Removes += n - n2
   258  		}
   259  	default:
   260  		return errors.New("unknown change type")
   261  	}
   262  	select {
   263  	case ch <- stat:
   264  		return nil
   265  	case <-ctx.Done():
   266  		return ctx.Err()
   267  	}
   268  }
   269  
   270  // prollyCountCellDiff counts the number of changes columns between two tuples
   271  // |from| and |to|. |mapping| should map columns from |from| to |to|.
   272  func prollyCountCellDiff(mapping val.OrdinalMapping, fromD, toD val.TupleDesc, from val.Tuple, to val.Tuple) uint64 {
   273  	newCols := uint64(toD.Count())
   274  	changed := uint64(0)
   275  	for i, j := range mapping {
   276  		newCols--
   277  		if j == -1 {
   278  			// column was dropped
   279  			changed++
   280  			continue
   281  		}
   282  
   283  		if fromD.Types[i].Enc != toD.Types[j].Enc {
   284  			// column type is different
   285  			changed++
   286  			continue
   287  		}
   288  
   289  		if fromD.CompareField(toD.GetField(j, to), i, from) != 0 {
   290  			// column was modified
   291  			changed++
   292  			continue
   293  		}
   294  	}
   295  
   296  	// some columns were added
   297  	changed += newCols
   298  	return changed
   299  }
   300  
   301  func reportNomsPkChanges(ctx context.Context, change *diff.Difference, fromSch, toSch schema.Schema, ch chan<- DiffStatProgress) error {
   302  	var stat DiffStatProgress
   303  	switch change.ChangeType {
   304  	case types.DiffChangeAdded:
   305  		stat = DiffStatProgress{Adds: 1}
   306  	case types.DiffChangeRemoved:
   307  		stat = DiffStatProgress{Removes: 1}
   308  	case types.DiffChangeModified:
   309  		oldTuple := change.OldValue.(types.Tuple)
   310  		newTuple := change.NewValue.(types.Tuple)
   311  		cellChanges, err := row.CountCellDiffs(oldTuple, newTuple, fromSch, toSch)
   312  		if err != nil {
   313  			return err
   314  		}
   315  		stat = DiffStatProgress{Changes: 1, CellChanges: cellChanges}
   316  	default:
   317  		return errors.New("unknown change type")
   318  	}
   319  	select {
   320  	case ch <- stat:
   321  		return nil
   322  	case <-ctx.Done():
   323  		return ctx.Err()
   324  	}
   325  }
   326  
   327  func reportNomsKeylessChanges(ctx context.Context, change *diff.Difference, fromSch, toSch schema.Schema, ch chan<- DiffStatProgress) error {
   328  	var oldCard uint64
   329  	if change.OldValue != nil {
   330  		v, err := change.OldValue.(types.Tuple).Get(row.KeylessCardinalityValIdx)
   331  		if err != nil {
   332  			return err
   333  		}
   334  		oldCard = uint64(v.(types.Uint))
   335  	}
   336  
   337  	var newCard uint64
   338  	if change.NewValue != nil {
   339  		v, err := change.NewValue.(types.Tuple).Get(row.KeylessCardinalityValIdx)
   340  		if err != nil {
   341  			return err
   342  		}
   343  		newCard = uint64(v.(types.Uint))
   344  	}
   345  
   346  	var stat DiffStatProgress
   347  	delta := int64(newCard) - int64(oldCard)
   348  	if delta > 0 {
   349  		stat = DiffStatProgress{Adds: uint64(delta)}
   350  	} else if delta < 0 {
   351  		stat = DiffStatProgress{Removes: uint64(-delta)}
   352  	} else {
   353  		return fmt.Errorf("diff with delta = 0 for key: %s", change.KeyValue.HumanReadableString())
   354  	}
   355  
   356  	select {
   357  	case ch <- stat:
   358  		return nil
   359  	case <-ctx.Done():
   360  		return ctx.Err()
   361  	}
   362  }