github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/prolly/tree/diff.go (about)

     1  // Copyright 2022 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tree
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"io"
    21  )
    22  
    23  type DiffType byte
    24  
    25  const (
    26  	AddedDiff    DiffType = 0
    27  	ModifiedDiff DiffType = 1
    28  	RemovedDiff  DiffType = 2
    29  )
    30  
    31  type Diff struct {
    32  	Key      Item
    33  	From, To Item
    34  	Type     DiffType
    35  }
    36  
    37  type DiffFn func(context.Context, Diff) error
    38  
    39  // Differ computes the diff between two prolly trees.
    40  // If `considerAllRowsModified` is true, it will consider every leaf to be modified and generate a diff for every leaf. (This
    41  // is useful in cases where the schema has changed and we want to consider a leaf changed even if the byte representation
    42  // of the leaf is the same.
    43  type Differ[K ~[]byte, O Ordering[K]] struct {
    44  	from, to                *cursor
    45  	fromStop, toStop        *cursor
    46  	order                   O
    47  	considerAllRowsModified bool
    48  }
    49  
    50  func DifferFromRoots[K ~[]byte, O Ordering[K]](
    51  	ctx context.Context,
    52  	fromNs NodeStore, toNs NodeStore,
    53  	from, to Node,
    54  	order O,
    55  	considerAllRowsModified bool,
    56  ) (Differ[K, O], error) {
    57  	var fc, tc *cursor
    58  	var err error
    59  
    60  	if !from.empty() {
    61  		fc, err = newCursorAtStart(ctx, fromNs, from)
    62  		if err != nil {
    63  			return Differ[K, O]{}, err
    64  		}
    65  	} else {
    66  		fc = &cursor{}
    67  	}
    68  
    69  	if !to.empty() {
    70  		tc, err = newCursorAtStart(ctx, toNs, to)
    71  		if err != nil {
    72  			return Differ[K, O]{}, err
    73  		}
    74  	} else {
    75  		tc = &cursor{}
    76  	}
    77  
    78  	fs, err := newCursorPastEnd(ctx, fromNs, from)
    79  	if err != nil {
    80  		return Differ[K, O]{}, err
    81  	}
    82  
    83  	ts, err := newCursorPastEnd(ctx, toNs, to)
    84  	if err != nil {
    85  		return Differ[K, O]{}, err
    86  	}
    87  
    88  	return Differ[K, O]{
    89  		from:                    fc,
    90  		to:                      tc,
    91  		fromStop:                fs,
    92  		toStop:                  ts,
    93  		order:                   order,
    94  		considerAllRowsModified: considerAllRowsModified,
    95  	}, nil
    96  }
    97  
    98  func DifferFromCursors[K ~[]byte, O Ordering[K]](
    99  	ctx context.Context,
   100  	fromRoot, toRoot Node,
   101  	findStart, findStop SearchFn,
   102  	fromStore, toStore NodeStore,
   103  	order O,
   104  ) (Differ[K, O], error) {
   105  	fromStart, err := newCursorFromSearchFn(ctx, fromStore, fromRoot, findStart)
   106  	if err != nil {
   107  		return Differ[K, O]{}, err
   108  	}
   109  	toStart, err := newCursorFromSearchFn(ctx, toStore, toRoot, findStart)
   110  	if err != nil {
   111  		return Differ[K, O]{}, err
   112  	}
   113  	fromStop, err := newCursorFromSearchFn(ctx, fromStore, fromRoot, findStop)
   114  	if err != nil {
   115  		return Differ[K, O]{}, err
   116  	}
   117  	toStop, err := newCursorFromSearchFn(ctx, toStore, toRoot, findStop)
   118  	if err != nil {
   119  		return Differ[K, O]{}, err
   120  	}
   121  	return Differ[K, O]{
   122  		from:     fromStart,
   123  		to:       toStart,
   124  		fromStop: fromStop,
   125  		toStop:   toStop,
   126  		order:    order,
   127  	}, nil
   128  }
   129  
   130  func (td Differ[K, O]) Next(ctx context.Context) (diff Diff, err error) {
   131  	for td.from.Valid() && td.from.compare(td.fromStop) < 0 && td.to.Valid() && td.to.compare(td.toStop) < 0 {
   132  
   133  		f := td.from.CurrentKey()
   134  		t := td.to.CurrentKey()
   135  		cmp := td.order.Compare(K(f), K(t))
   136  
   137  		switch {
   138  		case cmp < 0:
   139  			return sendRemoved(ctx, td.from)
   140  
   141  		case cmp > 0:
   142  			return sendAdded(ctx, td.to)
   143  
   144  		case cmp == 0:
   145  			// If the cursor schema has changed, then all rows should be considered modified.
   146  			// If the cursor schema hasn't changed, rows are modified iff their bytes have changed.
   147  			if td.considerAllRowsModified || !equalcursorValues(td.from, td.to) {
   148  				return sendModified(ctx, td.from, td.to)
   149  			}
   150  
   151  			// advance both cursors since we have already determined that they are equal. This needs to be done because
   152  			// skipCommon will not advance the cursors if they are equal in a collation sensitive comparison but differ
   153  			// in a byte comparison.
   154  			if err = td.from.advance(ctx); err != nil {
   155  				return Diff{}, err
   156  			}
   157  			if err = td.to.advance(ctx); err != nil {
   158  				return Diff{}, err
   159  			}
   160  
   161  			// seek ahead to the next diff and loop again
   162  			if err = skipCommon(ctx, td.from, td.to); err != nil {
   163  				return Diff{}, err
   164  			}
   165  		}
   166  	}
   167  
   168  	if td.from.Valid() && td.from.compare(td.fromStop) < 0 {
   169  		return sendRemoved(ctx, td.from)
   170  	}
   171  	if td.to.Valid() && td.to.compare(td.toStop) < 0 {
   172  		return sendAdded(ctx, td.to)
   173  	}
   174  
   175  	return Diff{}, io.EOF
   176  }
   177  
   178  func sendRemoved(ctx context.Context, from *cursor) (diff Diff, err error) {
   179  	diff = Diff{
   180  		Type: RemovedDiff,
   181  		Key:  from.CurrentKey(),
   182  		From: from.currentValue(),
   183  	}
   184  
   185  	if err = from.advance(ctx); err != nil {
   186  		return Diff{}, err
   187  	}
   188  	return
   189  }
   190  
   191  func sendAdded(ctx context.Context, to *cursor) (diff Diff, err error) {
   192  	diff = Diff{
   193  		Type: AddedDiff,
   194  		Key:  to.CurrentKey(),
   195  		To:   to.currentValue(),
   196  	}
   197  
   198  	if err = to.advance(ctx); err != nil {
   199  		return Diff{}, err
   200  	}
   201  	return
   202  }
   203  
   204  func sendModified(ctx context.Context, from, to *cursor) (diff Diff, err error) {
   205  	diff = Diff{
   206  		Type: ModifiedDiff,
   207  		Key:  from.CurrentKey(),
   208  		From: from.currentValue(),
   209  		To:   to.currentValue(),
   210  	}
   211  
   212  	if err = from.advance(ctx); err != nil {
   213  		return Diff{}, err
   214  	}
   215  	if err = to.advance(ctx); err != nil {
   216  		return Diff{}, err
   217  	}
   218  	return
   219  }
   220  
   221  func skipCommon(ctx context.Context, from, to *cursor) (err error) {
   222  	// track when |from.parent| and |to.parent| change
   223  	// to avoid unnecessary comparisons.
   224  	parentsAreNew := true
   225  
   226  	for from.Valid() && to.Valid() {
   227  		if !equalItems(from, to) {
   228  			// found the next difference
   229  			return nil
   230  		}
   231  
   232  		if parentsAreNew {
   233  			if equalParents(from, to) {
   234  				// if our parents are equal, we can search for differences
   235  				// faster at the next highest tree Level.
   236  				if err = skipCommonParents(ctx, from, to); err != nil {
   237  					return err
   238  				}
   239  				continue
   240  			}
   241  			parentsAreNew = false
   242  		}
   243  
   244  		// if one of the cursors is at the end of its node, it will
   245  		// need to Advance its parent and fetch a new node. In this
   246  		// case we need to Compare parents again.
   247  		parentsAreNew = from.atNodeEnd() || to.atNodeEnd()
   248  
   249  		if err = from.advance(ctx); err != nil {
   250  			return err
   251  		}
   252  		if err = to.advance(ctx); err != nil {
   253  			return err
   254  		}
   255  	}
   256  
   257  	return err
   258  }
   259  
   260  func skipCommonParents(ctx context.Context, from, to *cursor) (err error) {
   261  	err = skipCommon(ctx, from.parent, to.parent)
   262  	if err != nil {
   263  		return err
   264  	}
   265  
   266  	if from.parent.Valid() {
   267  		if err = from.fetchNode(ctx); err != nil {
   268  			return err
   269  		}
   270  		from.skipToNodeStart()
   271  	} else {
   272  		from.invalidateAtEnd()
   273  	}
   274  
   275  	if to.parent.Valid() {
   276  		if err = to.fetchNode(ctx); err != nil {
   277  			return err
   278  		}
   279  		to.skipToNodeStart()
   280  	} else {
   281  		to.invalidateAtEnd()
   282  	}
   283  
   284  	return
   285  }
   286  
   287  // todo(andy): assumes equal byte representations
   288  func equalItems(from, to *cursor) bool {
   289  	return bytes.Equal(from.CurrentKey(), to.CurrentKey()) &&
   290  		bytes.Equal(from.currentValue(), to.currentValue())
   291  }
   292  
   293  func equalParents(from, to *cursor) (eq bool) {
   294  	if from.parent != nil && to.parent != nil {
   295  		eq = equalItems(from.parent, to.parent)
   296  	}
   297  	return
   298  }
   299  
   300  func equalcursorValues(from, to *cursor) bool {
   301  	return bytes.Equal(from.currentValue(), to.currentValue())
   302  }