github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/diff/async_differ.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package diff
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"time"
    21  
    22  	"golang.org/x/sync/errgroup"
    23  
    24  	"github.com/dolthub/dolt/go/libraries/doltcore/row"
    25  	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
    26  	"github.com/dolthub/dolt/go/libraries/utils/async"
    27  	"github.com/dolthub/dolt/go/store/diff"
    28  	"github.com/dolthub/dolt/go/store/types"
    29  )
    30  
    31  func NewRowDiffer(ctx context.Context, format *types.NomsBinFormat, fromSch, toSch schema.Schema, buf int) RowDiffer {
    32  	ad := NewAsyncDiffer(buf)
    33  
    34  	// Returns an EmptyRowDiffer if the two schemas are not diffable.
    35  	if !schema.ArePrimaryKeySetsDiffable(format, fromSch, toSch) {
    36  		return &EmptyRowDiffer{}
    37  	}
    38  
    39  	if schema.IsKeyless(fromSch) || schema.IsKeyless(toSch) {
    40  		return &keylessDiffer{AsyncDiffer: ad}
    41  	}
    42  
    43  	return ad
    44  }
    45  
    46  // todo: make package private
    47  type AsyncDiffer struct {
    48  	diffChan   chan diff.Difference
    49  	bufferSize int
    50  
    51  	eg       *errgroup.Group
    52  	egCtx    context.Context
    53  	egCancel func()
    54  
    55  	diffStats map[types.DiffChangeType]uint64
    56  }
    57  
    58  var _ RowDiffer = &AsyncDiffer{}
    59  
    60  // todo: make package private once dolthub is migrated
    61  func NewAsyncDiffer(bufferedDiffs int) *AsyncDiffer {
    62  	return &AsyncDiffer{
    63  		diffChan:   make(chan diff.Difference, bufferedDiffs),
    64  		bufferSize: bufferedDiffs,
    65  		egCtx:      context.Background(),
    66  		egCancel:   func() {},
    67  		diffStats:  make(map[types.DiffChangeType]uint64),
    68  	}
    69  }
    70  
    71  func tableDontDescendLists(v1, v2 types.Value) bool {
    72  	kind := v1.Kind()
    73  	return !types.IsPrimitiveKind(kind) && kind != types.TupleKind && kind == v2.Kind() && kind != types.RefKind
    74  }
    75  
    76  func (ad *AsyncDiffer) Start(ctx context.Context, from, to types.Map) {
    77  	ad.start(ctx, func(ctx context.Context) error {
    78  		return diff.Diff(ctx, from, to, ad.diffChan, true, tableDontDescendLists)
    79  	})
    80  }
    81  
    82  func (ad *AsyncDiffer) StartWithRange(ctx context.Context, from, to types.Map, start types.Value, inRange types.ValueInRange) {
    83  	ad.start(ctx, func(ctx context.Context) error {
    84  		return diff.DiffMapRange(ctx, from, to, start, inRange, ad.diffChan, true, tableDontDescendLists)
    85  	})
    86  }
    87  
    88  func (ad *AsyncDiffer) start(ctx context.Context, diffFunc func(ctx context.Context) error) {
    89  	ad.eg, ad.egCtx = errgroup.WithContext(ctx)
    90  	ad.egCancel = async.GoWithCancel(ad.egCtx, ad.eg, func(ctx context.Context) (err error) {
    91  		defer close(ad.diffChan)
    92  		defer func() {
    93  			if r := recover(); r != nil {
    94  				err = fmt.Errorf("panic in diff.Diff: %v", r)
    95  			}
    96  		}()
    97  		return diffFunc(ctx)
    98  	})
    99  }
   100  
   101  func (ad *AsyncDiffer) Close() error {
   102  	ad.egCancel()
   103  	return ad.eg.Wait()
   104  }
   105  
   106  func (ad *AsyncDiffer) getDiffs(numDiffs int, timeoutChan <-chan time.Time, pred diffPredicate) ([]*diff.Difference, bool, error) {
   107  	diffs := make([]*diff.Difference, 0, numDiffs)
   108  	for {
   109  		select {
   110  		case d, more := <-ad.diffChan:
   111  			if more {
   112  				if pred(&d) {
   113  					ad.diffStats[d.ChangeType]++
   114  					diffs = append(diffs, &d)
   115  				}
   116  				if numDiffs != 0 && numDiffs == len(diffs) {
   117  					return diffs, true, nil
   118  				}
   119  			} else {
   120  				return diffs, false, ad.eg.Wait()
   121  			}
   122  		case <-timeoutChan:
   123  			return diffs, true, nil
   124  		case <-ad.egCtx.Done():
   125  			return nil, false, ad.eg.Wait()
   126  		}
   127  	}
   128  }
   129  
   130  var forever <-chan time.Time = make(chan time.Time)
   131  
   132  type diffPredicate func(*diff.Difference) bool
   133  
   134  var alwaysTruePredicate diffPredicate = func(*diff.Difference) bool {
   135  	return true
   136  }
   137  
   138  func hasChangeTypePredicate(changeType types.DiffChangeType) diffPredicate {
   139  	return func(d *diff.Difference) bool {
   140  		return d.ChangeType == changeType
   141  	}
   142  }
   143  
   144  func (ad *AsyncDiffer) GetDiffs(numDiffs int, timeout time.Duration) ([]*diff.Difference, bool, error) {
   145  	if timeout < 0 {
   146  		return ad.GetDiffsWithoutTimeout(numDiffs)
   147  	}
   148  	return ad.getDiffs(numDiffs, time.After(timeout), alwaysTruePredicate)
   149  }
   150  
   151  func (ad *AsyncDiffer) GetDiffsWithFilter(numDiffs int, timeout time.Duration, filterByChangeType types.DiffChangeType) ([]*diff.Difference, bool, error) {
   152  	if timeout < 0 {
   153  		return ad.GetDiffsWithoutTimeoutWithFilter(numDiffs, filterByChangeType)
   154  	}
   155  	return ad.getDiffs(numDiffs, time.After(timeout), hasChangeTypePredicate(filterByChangeType))
   156  }
   157  
   158  func (ad *AsyncDiffer) GetDiffsWithoutTimeoutWithFilter(numDiffs int, filterByChangeType types.DiffChangeType) ([]*diff.Difference, bool, error) {
   159  	return ad.getDiffs(numDiffs, forever, hasChangeTypePredicate(filterByChangeType))
   160  }
   161  
   162  func (ad *AsyncDiffer) GetDiffsWithoutTimeout(numDiffs int) ([]*diff.Difference, bool, error) {
   163  	return ad.getDiffs(numDiffs, forever, alwaysTruePredicate)
   164  }
   165  
   166  type keylessDiffer struct {
   167  	*AsyncDiffer
   168  
   169  	df         diff.Difference
   170  	copiesLeft uint64
   171  }
   172  
   173  var _ RowDiffer = &keylessDiffer{}
   174  
   175  func (kd *keylessDiffer) getDiffs(numDiffs int, timeoutChan <-chan time.Time, pred diffPredicate) ([]*diff.Difference, bool, error) {
   176  	diffs := make([]*diff.Difference, numDiffs)
   177  	idx := 0
   178  
   179  	for {
   180  		// first populate |diffs| with copies of |kd.df|
   181  
   182  		cpy := kd.df // save a copy of kd.df to reference
   183  		for (idx < numDiffs) && (kd.copiesLeft > 0) {
   184  			diffs[idx] = &cpy
   185  			idx++
   186  			kd.copiesLeft--
   187  		}
   188  		if idx == numDiffs {
   189  			return diffs, true, nil
   190  		}
   191  
   192  		// then find the next Difference the satisfies |pred|
   193  		match := false
   194  		for !match {
   195  			select {
   196  			case <-timeoutChan:
   197  				return diffs, true, nil
   198  
   199  			case <-kd.egCtx.Done():
   200  				return nil, false, kd.eg.Wait()
   201  
   202  			case d, more := <-kd.diffChan:
   203  				if !more {
   204  					return diffs[:idx], more, nil
   205  				}
   206  
   207  				var err error
   208  				kd.df, kd.copiesLeft, err = convertDiff(d)
   209  				if err != nil {
   210  					return nil, false, err
   211  				}
   212  
   213  				match = pred(&kd.df)
   214  			}
   215  		}
   216  	}
   217  }
   218  
   219  func (kd *keylessDiffer) GetDiffs(numDiffs int, timeout time.Duration) ([]*diff.Difference, bool, error) {
   220  	if timeout < 0 {
   221  		return kd.getDiffs(numDiffs, forever, alwaysTruePredicate)
   222  	}
   223  	return kd.getDiffs(numDiffs, time.After(timeout), alwaysTruePredicate)
   224  }
   225  
   226  func (kd *keylessDiffer) GetDiffsWithFilter(numDiffs int, timeout time.Duration, filterByChangeType types.DiffChangeType) ([]*diff.Difference, bool, error) {
   227  	if timeout < 0 {
   228  		return kd.getDiffs(numDiffs, forever, hasChangeTypePredicate(filterByChangeType))
   229  	}
   230  	return kd.getDiffs(numDiffs, time.After(timeout), hasChangeTypePredicate(filterByChangeType))
   231  }
   232  
   233  // convertDiff reports the cardinality of a change,
   234  // and converts updates to adds or deletes
   235  func convertDiff(df diff.Difference) (diff.Difference, uint64, error) {
   236  	var oldCard uint64
   237  	if df.OldValue != nil {
   238  		v, err := df.OldValue.(types.Tuple).Get(row.KeylessCardinalityValIdx)
   239  		if err != nil {
   240  			return df, 0, err
   241  		}
   242  		oldCard = uint64(v.(types.Uint))
   243  	}
   244  
   245  	var newCard uint64
   246  	if df.NewValue != nil {
   247  		v, err := df.NewValue.(types.Tuple).Get(row.KeylessCardinalityValIdx)
   248  		if err != nil {
   249  			return df, 0, err
   250  		}
   251  		newCard = uint64(v.(types.Uint))
   252  	}
   253  
   254  	switch df.ChangeType {
   255  	case types.DiffChangeRemoved:
   256  		return df, oldCard, nil
   257  
   258  	case types.DiffChangeAdded:
   259  		return df, newCard, nil
   260  
   261  	case types.DiffChangeModified:
   262  		delta := int64(newCard) - int64(oldCard)
   263  		if delta > 0 {
   264  			df.ChangeType = types.DiffChangeAdded
   265  			df.OldValue = nil
   266  			return df, uint64(delta), nil
   267  		} else if delta < 0 {
   268  			df.ChangeType = types.DiffChangeRemoved
   269  			df.NewValue = nil
   270  			return df, uint64(-delta), nil
   271  		} else {
   272  			panic(fmt.Sprintf("diff with delta = 0 for key: %s", df.KeyValue.HumanReadableString()))
   273  		}
   274  	default:
   275  		return df, 0, fmt.Errorf("unexpected DiffChange type %d", df.ChangeType)
   276  	}
   277  }
   278  
   279  type EmptyRowDiffer struct {
   280  }
   281  
   282  var _ RowDiffer = &EmptyRowDiffer{}
   283  
   284  func (e EmptyRowDiffer) Start(ctx context.Context, from, to types.Map) {
   285  }
   286  
   287  func (e EmptyRowDiffer) StartWithRange(ctx context.Context, from, to types.Map, start types.Value, inRange types.ValueInRange) {
   288  
   289  }
   290  
   291  func (e EmptyRowDiffer) GetDiffs(numDiffs int, timeout time.Duration) ([]*diff.Difference, bool, error) {
   292  	return nil, false, nil
   293  }
   294  
   295  func (e EmptyRowDiffer) GetDiffsWithFilter(numDiffs int, timeout time.Duration, filterByChangeType types.DiffChangeType) ([]*diff.Difference, bool, error) {
   296  	return nil, false, nil
   297  }
   298  
   299  func (e EmptyRowDiffer) Close() error {
   300  	return nil
   301  }