github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/diff/diff_stat.go (about) 1 // Copyright 2019 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package diff 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "io" 22 "time" 23 24 "github.com/dolthub/dolt/go/cmd/dolt/errhand" 25 "github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable" 26 "github.com/dolthub/dolt/go/libraries/doltcore/row" 27 "github.com/dolthub/dolt/go/libraries/doltcore/schema" 28 "github.com/dolthub/dolt/go/store/diff" 29 "github.com/dolthub/dolt/go/store/prolly" 30 "github.com/dolthub/dolt/go/store/prolly/tree" 31 "github.com/dolthub/dolt/go/store/types" 32 "github.com/dolthub/dolt/go/store/val" 33 ) 34 35 var ErrPrimaryKeySetChanged = errors.New("primary key set changed") 36 37 type DiffStatProgress struct { 38 Adds, Removes, Changes, CellChanges, NewRowSize, OldRowSize, NewCellSize, OldCellSize uint64 39 } 40 41 type prollyReporter func(ctx context.Context, vMapping val.OrdinalMapping, fromD, toD val.TupleDesc, change tree.Diff, ch chan<- DiffStatProgress) error 42 type nomsReporter func(ctx context.Context, change *diff.Difference, fromSch, toSch schema.Schema, ch chan<- DiffStatProgress) error 43 44 // Stat reports a stat of diff changes between two values 45 // todo: make package private once dolthub is migrated 46 func Stat(ctx context.Context, ch chan DiffStatProgress, from, to durable.Index, fromSch, toSch schema.Schema) (err error) { 47 fc, err := from.Count() 48 if err != nil { 49 return err 50 } 51 tc, err := to.Count() 52 if err != nil { 53 return err 54 } 55 ch <- DiffStatProgress{OldRowSize: fc, NewRowSize: tc} 56 57 fk, tk := schema.IsKeyless(fromSch), schema.IsKeyless(toSch) 58 var keyless bool 59 if fk && tk { 60 keyless = true 61 } else if fk != tk { 62 return fmt.Errorf("cannot perform a diff between keyless and keyed schema") 63 } 64 65 if types.IsFormat_DOLT(from.Format()) { 66 return diffProllyTrees(ctx, ch, keyless, from, to, fromSch, toSch) 67 } 68 69 return diffNomsMaps(ctx, ch, keyless, from, to, fromSch, toSch) 70 } 71 72 // StatForTableDelta pushes diff stat progress messages for the table delta given to the channel given 73 func StatForTableDelta(ctx context.Context, ch chan DiffStatProgress, td TableDelta) error { 74 fromSch, toSch, err := td.GetSchemas(ctx) 75 if err != nil { 76 return errhand.BuildDError("cannot retrieve schema for table %s", td.ToName).AddCause(err).Build() 77 } 78 79 if !schema.ArePrimaryKeySetsDiffable(td.Format(), fromSch, toSch) { 80 return fmt.Errorf("failed to compute diff stat for table %s: %w", td.CurName(), ErrPrimaryKeySetChanged) 81 } 82 83 keyless, err := td.IsKeyless(ctx) 84 if err != nil { 85 return err 86 } 87 88 fromRows, toRows, err := td.GetRowData(ctx) 89 if err != nil { 90 return err 91 } 92 93 if types.IsFormat_DOLT(td.Format()) { 94 return diffProllyTrees(ctx, ch, keyless, fromRows, toRows, fromSch, toSch) 95 } else { 96 return diffNomsMaps(ctx, ch, keyless, fromRows, toRows, fromSch, toSch) 97 } 98 } 99 100 func diffProllyTrees(ctx context.Context, ch chan DiffStatProgress, keyless bool, from, to durable.Index, fromSch, toSch schema.Schema) error { 101 _, vMapping, err := schema.MapSchemaBasedOnTagAndName(fromSch, toSch) 102 if err != nil { 103 return err 104 } 105 106 var f, t prolly.Map 107 if from != nil { 108 f = durable.ProllyMapFromIndex(from) 109 } 110 if to != nil { 111 t = durable.ProllyMapFromIndex(to) 112 113 } 114 115 _, fVD := f.Descriptors() 116 _, tVD := t.Descriptors() 117 118 var rpr prollyReporter 119 if keyless { 120 rpr = reportKeylessChanges 121 } else { 122 var fc uint64 123 if from != nil { 124 fc, err = from.Count() 125 if err != nil { 126 return err 127 } 128 } 129 130 cfc := uint64(len(fromSch.GetAllCols().GetColumns())) * fc 131 var tc uint64 132 if to != nil { 133 tc, err = to.Count() 134 if err != nil { 135 return err 136 } 137 } 138 139 ctc := uint64(len(toSch.GetAllCols().GetColumns())) * tc 140 rpr = reportPkChanges 141 ch <- DiffStatProgress{ 142 OldRowSize: fc, 143 NewRowSize: tc, 144 OldCellSize: cfc, 145 NewCellSize: ctc, 146 } 147 } 148 149 // TODO: Use `vMapping` to determine whether columns have been added or removed. If so, then all rows should 150 // count as modifications in the diff. 151 considerAllRowsModified := false 152 err = prolly.DiffMaps(ctx, f, t, considerAllRowsModified, func(ctx context.Context, diff tree.Diff) error { 153 return rpr(ctx, vMapping, fVD, tVD, diff, ch) 154 }) 155 if err != nil && err != io.EOF { 156 return err 157 } 158 return nil 159 } 160 161 func diffNomsMaps(ctx context.Context, ch chan DiffStatProgress, keyless bool, fromRows durable.Index, toRows durable.Index, fromSch, toSch schema.Schema) error { 162 var rpr nomsReporter 163 if keyless { 164 rpr = reportNomsKeylessChanges 165 } else { 166 fc, err := fromRows.Count() 167 if err != nil { 168 return err 169 } 170 cfc := uint64(len(fromSch.GetAllCols().GetColumns())) * fc 171 tc, err := toRows.Count() 172 if err != nil { 173 return err 174 } 175 ctc := uint64(len(toSch.GetAllCols().GetColumns())) * tc 176 rpr = reportNomsPkChanges 177 ch <- DiffStatProgress{ 178 OldRowSize: fc, 179 NewRowSize: tc, 180 OldCellSize: cfc, 181 NewCellSize: ctc, 182 } 183 } 184 185 return statWithReporter(ctx, ch, durable.NomsMapFromIndex(fromRows), durable.NomsMapFromIndex(toRows), rpr, fromSch, toSch) 186 } 187 188 func statWithReporter(ctx context.Context, ch chan DiffStatProgress, from, to types.Map, rpr nomsReporter, fromSch, toSch schema.Schema) (err error) { 189 ad := NewAsyncDiffer(1024) 190 ad.Start(ctx, from, to) 191 defer func() { 192 if cerr := ad.Close(); cerr != nil && err == nil { 193 err = cerr 194 } 195 }() 196 197 var more bool 198 var diffs []*diff.Difference 199 for { 200 diffs, more, err = ad.GetDiffs(100, time.Millisecond) 201 if err != nil { 202 return err 203 } 204 205 for _, df := range diffs { 206 err = rpr(ctx, df, fromSch, toSch, ch) 207 if err != nil { 208 return err 209 } 210 } 211 212 if !more { 213 break 214 } 215 } 216 217 return nil 218 } 219 220 func reportPkChanges(ctx context.Context, vMapping val.OrdinalMapping, fromD, toD val.TupleDesc, change tree.Diff, ch chan<- DiffStatProgress) error { 221 var stat DiffStatProgress 222 switch change.Type { 223 case tree.AddedDiff: 224 stat.Adds++ 225 case tree.RemovedDiff: 226 stat.Removes++ 227 case tree.ModifiedDiff: 228 stat.CellChanges = prollyCountCellDiff(vMapping, fromD, toD, val.Tuple(change.From), val.Tuple(change.To)) 229 stat.Changes++ 230 default: 231 return errors.New("unknown change type") 232 } 233 select { 234 case ch <- stat: 235 return nil 236 case <-ctx.Done(): 237 return ctx.Err() 238 } 239 } 240 241 func reportKeylessChanges(ctx context.Context, vMapping val.OrdinalMapping, fromD, toD val.TupleDesc, change tree.Diff, ch chan<- DiffStatProgress) error { 242 var stat DiffStatProgress 243 var n, n2 uint64 244 switch change.Type { 245 case tree.AddedDiff: 246 n, _ = toD.GetUint64(0, val.Tuple(change.To)) 247 stat.Adds += n 248 case tree.RemovedDiff: 249 n, _ = fromD.GetUint64(0, val.Tuple(change.From)) 250 stat.Removes += n 251 case tree.ModifiedDiff: 252 n, _ = fromD.GetUint64(0, val.Tuple(change.From)) 253 n2, _ = toD.GetUint64(0, val.Tuple(change.To)) 254 if n < n2 { 255 stat.Adds += n2 - n 256 } else { 257 stat.Removes += n - n2 258 } 259 default: 260 return errors.New("unknown change type") 261 } 262 select { 263 case ch <- stat: 264 return nil 265 case <-ctx.Done(): 266 return ctx.Err() 267 } 268 } 269 270 // prollyCountCellDiff counts the number of changes columns between two tuples 271 // |from| and |to|. |mapping| should map columns from |from| to |to|. 272 func prollyCountCellDiff(mapping val.OrdinalMapping, fromD, toD val.TupleDesc, from val.Tuple, to val.Tuple) uint64 { 273 newCols := uint64(toD.Count()) 274 changed := uint64(0) 275 for i, j := range mapping { 276 newCols-- 277 if j == -1 { 278 // column was dropped 279 changed++ 280 continue 281 } 282 283 if fromD.Types[i].Enc != toD.Types[j].Enc { 284 // column type is different 285 changed++ 286 continue 287 } 288 289 if fromD.CompareField(toD.GetField(j, to), i, from) != 0 { 290 // column was modified 291 changed++ 292 continue 293 } 294 } 295 296 // some columns were added 297 changed += newCols 298 return changed 299 } 300 301 func reportNomsPkChanges(ctx context.Context, change *diff.Difference, fromSch, toSch schema.Schema, ch chan<- DiffStatProgress) error { 302 var stat DiffStatProgress 303 switch change.ChangeType { 304 case types.DiffChangeAdded: 305 stat = DiffStatProgress{Adds: 1} 306 case types.DiffChangeRemoved: 307 stat = DiffStatProgress{Removes: 1} 308 case types.DiffChangeModified: 309 oldTuple := change.OldValue.(types.Tuple) 310 newTuple := change.NewValue.(types.Tuple) 311 cellChanges, err := row.CountCellDiffs(oldTuple, newTuple, fromSch, toSch) 312 if err != nil { 313 return err 314 } 315 stat = DiffStatProgress{Changes: 1, CellChanges: cellChanges} 316 default: 317 return errors.New("unknown change type") 318 } 319 select { 320 case ch <- stat: 321 return nil 322 case <-ctx.Done(): 323 return ctx.Err() 324 } 325 } 326 327 func reportNomsKeylessChanges(ctx context.Context, change *diff.Difference, fromSch, toSch schema.Schema, ch chan<- DiffStatProgress) error { 328 var oldCard uint64 329 if change.OldValue != nil { 330 v, err := change.OldValue.(types.Tuple).Get(row.KeylessCardinalityValIdx) 331 if err != nil { 332 return err 333 } 334 oldCard = uint64(v.(types.Uint)) 335 } 336 337 var newCard uint64 338 if change.NewValue != nil { 339 v, err := change.NewValue.(types.Tuple).Get(row.KeylessCardinalityValIdx) 340 if err != nil { 341 return err 342 } 343 newCard = uint64(v.(types.Uint)) 344 } 345 346 var stat DiffStatProgress 347 delta := int64(newCard) - int64(oldCard) 348 if delta > 0 { 349 stat = DiffStatProgress{Adds: uint64(delta)} 350 } else if delta < 0 { 351 stat = DiffStatProgress{Removes: uint64(-delta)} 352 } else { 353 return fmt.Errorf("diff with delta = 0 for key: %s", change.KeyValue.HumanReadableString()) 354 } 355 356 select { 357 case ch <- stat: 358 return nil 359 case <-ctx.Done(): 360 return ctx.Err() 361 } 362 }