github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/enginetest/validation.go (about)

     1  // Copyright 2020 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package enginetest
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"io"
    21  
    22  	"github.com/dolthub/go-mysql-server/sql"
    23  	"github.com/dolthub/go-mysql-server/sql/mysql_db"
    24  	sqltypes "github.com/dolthub/go-mysql-server/sql/types"
    25  
    26  	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
    27  	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable"
    28  	"github.com/dolthub/dolt/go/libraries/doltcore/ref"
    29  	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
    30  	"github.com/dolthub/dolt/go/libraries/doltcore/sqle"
    31  	"github.com/dolthub/dolt/go/store/prolly"
    32  	"github.com/dolthub/dolt/go/store/prolly/tree"
    33  	"github.com/dolthub/dolt/go/store/types"
    34  	"github.com/dolthub/dolt/go/store/val"
    35  )
    36  
    37  func ValidateDatabase(ctx context.Context, db sql.Database) (err error) {
    38  	switch tdb := db.(type) {
    39  	case sqle.Database:
    40  		return ValidateDoltDatabase(ctx, tdb)
    41  	case mysql_db.PrivilegedDatabase:
    42  		return ValidateDatabase(ctx, tdb.Unwrap())
    43  	default:
    44  		return nil
    45  	}
    46  }
    47  
    48  func ValidateDoltDatabase(ctx context.Context, db sqle.Database) (err error) {
    49  	if !types.IsFormat_DOLT(db.GetDoltDB().Format()) {
    50  		return nil
    51  	}
    52  	for _, stage := range validationStages {
    53  		if err = stage(ctx, db); err != nil {
    54  			return err
    55  		}
    56  	}
    57  	return
    58  }
    59  
// validator is a single validation pass over a Dolt database. It returns a
// non-nil error when the pass fails.
type validator func(ctx context.Context, db sqle.Database) error

// validationStages lists the validators that ValidateDoltDatabase runs, in
// order. The first stage to return an error aborts the remaining stages.
var validationStages = []validator{
	validateChunkReferences,
	validateSecondaryIndexes,
}
    66  
    67  // validateChunkReferences checks for dangling chunks.
    68  func validateChunkReferences(ctx context.Context, db sqle.Database) error {
    69  	validateIndex := func(ctx context.Context, idx durable.Index) error {
    70  		pm := durable.ProllyMapFromIndex(idx)
    71  		return pm.WalkNodes(ctx, func(ctx context.Context, nd tree.Node) error {
    72  			if nd.Size() <= 0 {
    73  				return fmt.Errorf("encountered nil tree.Node")
    74  			}
    75  			return nil
    76  		})
    77  	}
    78  
    79  	cb := func(n string, t *doltdb.Table, sch schema.Schema) (stop bool, err error) {
    80  		if sch == nil {
    81  			return true, fmt.Errorf("expected non-nil schema: %v", sch)
    82  		}
    83  
    84  		rows, err := t.GetRowData(ctx)
    85  		if err != nil {
    86  			return true, err
    87  		}
    88  		if err = validateIndex(ctx, rows); err != nil {
    89  			return true, err
    90  		}
    91  
    92  		indexes, err := t.GetIndexSet(ctx)
    93  		if err != nil {
    94  			return true, err
    95  		}
    96  		err = durable.IterAllIndexes(ctx, sch, indexes, func(_ string, idx durable.Index) error {
    97  			return validateIndex(ctx, idx)
    98  		})
    99  		if err != nil {
   100  			return true, err
   101  		}
   102  		return
   103  	}
   104  
   105  	return iterDatabaseTables(ctx, db, cb)
   106  }
   107  
   108  // validateSecondaryIndexes checks that secondary index contents are consistent
   109  // with primary index contents.
   110  func validateSecondaryIndexes(ctx context.Context, db sqle.Database) error {
   111  	cb := func(n string, t *doltdb.Table, sch schema.Schema) (stop bool, err error) {
   112  		rows, err := t.GetRowData(ctx)
   113  		if err != nil {
   114  			return false, err
   115  		}
   116  		primary := durable.ProllyMapFromIndex(rows)
   117  
   118  		for _, def := range sch.Indexes().AllIndexes() {
   119  			set, err := t.GetIndexSet(ctx)
   120  			if err != nil {
   121  				return true, err
   122  			}
   123  			idx, err := set.GetIndex(ctx, sch, def.Name())
   124  			if err != nil {
   125  				return true, err
   126  			}
   127  			secondary := durable.ProllyMapFromIndex(idx)
   128  
   129  			err = validateIndexConsistency(ctx, sch, def, primary, secondary)
   130  			if err != nil {
   131  				return true, err
   132  			}
   133  		}
   134  		return false, nil
   135  	}
   136  	return iterDatabaseTables(ctx, db, cb)
   137  }
   138  
   139  func validateIndexConsistency(
   140  	ctx context.Context,
   141  	sch schema.Schema,
   142  	def schema.Index,
   143  	primary, secondary prolly.Map,
   144  ) error {
   145  	if schema.IsKeyless(sch) {
   146  		return validateKeylessIndex(ctx, sch, def, primary, secondary)
   147  	} else {
   148  		return validatePkIndex(ctx, sch, def, primary, secondary)
   149  	}
   150  }
   151  
   152  // printIndexContents prints the contents of |prollyMap| to stdout. Intended for use debugging
   153  // index consistency issues.
   154  func printIndexContents(ctx context.Context, prollyMap prolly.Map) {
   155  	fmt.Printf("Secondary index contents:\n")
   156  	iterAll, _ := prollyMap.IterAll(ctx)
   157  	for {
   158  		k, _, err := iterAll.Next(ctx)
   159  		if err == io.EOF {
   160  			break
   161  		}
   162  		fmt.Printf("  - k: %v \n", k)
   163  	}
   164  }
   165  
   166  func validateKeylessIndex(ctx context.Context, sch schema.Schema, def schema.Index, primary, secondary prolly.Map) error {
   167  	// Full-Text indexes do not make use of their internal map, so we may safely skip this check
   168  	if def.IsFullText() {
   169  		return nil
   170  	}
   171  
   172  	// Indexes on virtual columns cannot be rebuilt via the method below
   173  	if isVirtualIndex(def, sch) {
   174  		return nil
   175  	}
   176  
   177  	secondary = prolly.ConvertToSecondaryKeylessIndex(secondary)
   178  	idxDesc, _ := secondary.Descriptors()
   179  	builder := val.NewTupleBuilder(idxDesc)
   180  	mapping := ordinalMappingsForSecondaryIndex(sch, def)
   181  	_, vd := primary.Descriptors()
   182  
   183  	iter, err := primary.IterAll(ctx)
   184  	if err != nil {
   185  		return err
   186  	}
   187  
   188  	for {
   189  		hashId, value, err := iter.Next(ctx)
   190  		if err == io.EOF {
   191  			return nil
   192  		}
   193  		if err != nil {
   194  			return err
   195  		}
   196  
   197  		// make secondary index key
   198  		for i := range mapping {
   199  			j := mapping.MapOrdinal(i)
   200  			// first field in |value| is cardinality
   201  			field := value.GetField(j + 1)
   202  
   203  			if shouldDereferenceContent(j+1, vd, i, idxDesc) {
   204  				field, err = dereferenceContent(ctx, vd, j+1, value, secondary.NodeStore())
   205  				if err != nil {
   206  					return err
   207  				}
   208  			} else if def.IsSpatial() {
   209  				geom, err := dereferenceGeometry(ctx, vd, j+1, value, secondary.NodeStore())
   210  				if err != nil {
   211  					return err
   212  				}
   213  				geom, _, err = sqltypes.GeometryType{}.Convert(geom)
   214  				if err != nil {
   215  					return err
   216  				}
   217  				cell := tree.ZCell(geom.(sqltypes.GeometryValue))
   218  				field = cell[:]
   219  			}
   220  
   221  			// Apply prefix lengths if they are configured
   222  			if len(def.PrefixLengths()) > i {
   223  				field = trimValueToPrefixLength(field, def.PrefixLengths()[i], vd.Types[j+1].Enc)
   224  			}
   225  
   226  			builder.PutRaw(i, field)
   227  		}
   228  		builder.PutRaw(idxDesc.Count()-1, hashId.GetField(0))
   229  		k := builder.Build(primary.Pool())
   230  
   231  		ok, err := secondary.Has(ctx, k)
   232  		if err != nil {
   233  			return err
   234  		}
   235  		if !ok {
   236  			printIndexContents(ctx, secondary)
   237  			return fmt.Errorf("index key %s not found in index %s", builder.Desc.Format(k), def.Name())
   238  		}
   239  	}
   240  }
   241  
   242  func validatePkIndex(ctx context.Context, sch schema.Schema, def schema.Index, primary, secondary prolly.Map) error {
   243  	// Full-Text indexes do not make use of their internal map, so we may safely skip this check
   244  	if def.IsFullText() {
   245  		return nil
   246  	}
   247  
   248  	// Indexes on virtual columns cannot be rebuilt via the method below
   249  	if isVirtualIndex(def, sch) {
   250  		return nil
   251  	}
   252  
   253  	// secondary indexes have empty values
   254  	idxDesc, _ := secondary.Descriptors()
   255  	builder := val.NewTupleBuilder(idxDesc)
   256  	mapping := ordinalMappingsForSecondaryIndex(sch, def)
   257  	kd, vd := primary.Descriptors()
   258  
   259  	// Before we walk through the primary index data and validate that every row in the primary index exists in the
   260  	// secondary index, we also check that the primary index and secondary index have the same number of rows.
   261  	// Otherwise, we won't catch if the secondary index has extra, bogus data in it.
   262  	totalSecondaryCount, err := secondary.Count()
   263  	if err != nil {
   264  		return err
   265  	}
   266  	totalPrimaryCount, err := primary.Count()
   267  	if err != nil {
   268  		return err
   269  	}
   270  	if totalSecondaryCount != totalPrimaryCount {
   271  		return fmt.Errorf("primary index row count (%d) does not match secondary index row count (%d)",
   272  			totalPrimaryCount, totalSecondaryCount)
   273  	}
   274  
   275  	pkSize := kd.Count()
   276  	iter, err := primary.IterAll(ctx)
   277  	if err != nil {
   278  		return err
   279  	}
   280  
   281  	for {
   282  		key, value, err := iter.Next(ctx)
   283  		if err == io.EOF {
   284  			return nil
   285  		}
   286  		if err != nil {
   287  			return err
   288  		}
   289  
   290  		// make secondary index key
   291  		for i := range mapping {
   292  			j := mapping.MapOrdinal(i)
   293  			if j < pkSize {
   294  				builder.PutRaw(i, key.GetField(j))
   295  			} else {
   296  				field := value.GetField(j - pkSize)
   297  
   298  				if shouldDereferenceContent(j-pkSize, vd, i, idxDesc) {
   299  					field, err = dereferenceContent(ctx, vd, j-pkSize, value, secondary.NodeStore())
   300  					if err != nil {
   301  						return err
   302  					}
   303  				} else if def.IsSpatial() {
   304  					geom, err := dereferenceGeometry(ctx, vd, j-pkSize, value, secondary.NodeStore())
   305  					if err != nil {
   306  						return err
   307  					}
   308  					geom, _, err = sqltypes.GeometryType{}.Convert(geom)
   309  					if err != nil {
   310  						return err
   311  					}
   312  					cell := tree.ZCell(geom.(sqltypes.GeometryValue))
   313  					field = cell[:]
   314  				}
   315  
   316  				// Apply prefix lengths if they are configured
   317  				if len(def.PrefixLengths()) > i {
   318  					field = trimValueToPrefixLength(field, def.PrefixLengths()[i], vd.Types[j-pkSize].Enc)
   319  				}
   320  
   321  				builder.PutRaw(i, field)
   322  			}
   323  		}
   324  		k := builder.Build(primary.Pool())
   325  
   326  		ok, err := secondary.Has(ctx, k)
   327  		if err != nil {
   328  			return err
   329  		}
   330  		if !ok {
   331  			printIndexContents(ctx, secondary)
   332  			return fmt.Errorf("index key %v not found in index %s", builder.Desc.Format(k), def.Name())
   333  		}
   334  	}
   335  }
   336  
   337  func isVirtualIndex(def schema.Index, sch schema.Schema) bool {
   338  	for _, colName := range def.ColumnNames() {
   339  		col, ok := sch.GetAllCols().GetByName(colName)
   340  		if !ok {
   341  			panic(fmt.Sprintf("column not found: %s", colName))
   342  		}
   343  		if col.Virtual {
   344  			return true
   345  		}
   346  	}
   347  	return false
   348  }
   349  
   350  // shouldDereferenceContent returns true if address encoded content should be dereferenced when
   351  // building a key for a secondary index. This is determined by looking at the encoding of the field
   352  // in the main table (|tablePos| and |tableValueDescriptor|) and the encoding of the field in the index
   353  // (|indexPos| and |indexKeyDescriptor|) and seeing if one is an address encoding and the other is not.
   354  func shouldDereferenceContent(tablePos int, tableValueDescriptor val.TupleDesc, indexPos int, indexKeyDescriptor val.TupleDesc) bool {
   355  	if tableValueDescriptor.Types[tablePos].Enc == val.StringAddrEnc && indexKeyDescriptor.Types[indexPos].Enc != val.StringAddrEnc {
   356  		return true
   357  	}
   358  
   359  	if tableValueDescriptor.Types[tablePos].Enc == val.BytesAddrEnc && indexKeyDescriptor.Types[indexPos].Enc != val.BytesAddrEnc {
   360  		return true
   361  	}
   362  
   363  	return false
   364  }
   365  
   366  // dereferenceContent dereferences an address encoded field (e.g. TEXT, BLOB) to load the content
   367  // and return a []byte. |tableValueDescriptor| is the tuple descriptor for the value tuple of the main
   368  // table, |tablePos| is the field index into the value tuple, and |tuple| is the value tuple from the
   369  // main table.
   370  func dereferenceContent(ctx context.Context, tableValueDescriptor val.TupleDesc, tablePos int, tuple val.Tuple, ns tree.NodeStore) ([]byte, error) {
   371  	v, err := tree.GetField(ctx, tableValueDescriptor, tablePos, tuple, ns)
   372  	if err != nil {
   373  		return nil, err
   374  	}
   375  	if v == nil {
   376  		return nil, nil
   377  	}
   378  
   379  	switch x := v.(type) {
   380  	case string:
   381  		return []byte(x), nil
   382  	case []byte:
   383  		return x, nil
   384  	default:
   385  		return nil, fmt.Errorf("unexpected type for address encoded content: %T", v)
   386  	}
   387  }
   388  
   389  // dereferenceGeometry dereferences an address encoded geometry field to load the content
   390  // and return a GeometryType. |tableValueDescriptor| is the tuple descriptor for the value tuple of the main
   391  // table, |tablePos| is the field index into the value tuple, and |tuple| is the value tuple from the
   392  // main table.
   393  func dereferenceGeometry(ctx context.Context, tableValueDescriptor val.TupleDesc, tablePos int, tuple val.Tuple, ns tree.NodeStore) (interface{}, error) {
   394  	v, err := tree.GetField(ctx, tableValueDescriptor, tablePos, tuple, ns)
   395  	if err != nil {
   396  		return nil, err
   397  	}
   398  	if v == nil {
   399  		return nil, nil
   400  	}
   401  
   402  	switch x := v.(type) {
   403  	case string:
   404  		return []byte(x), nil
   405  	case []byte:
   406  		return x, nil
   407  	case sqltypes.Point, sqltypes.LineString, sqltypes.Polygon, sqltypes.MultiPoint, sqltypes.MultiLineString, sqltypes.MultiPolygon, sqltypes.GeometryType, sqltypes.GeomColl:
   408  		return x, nil
   409  	default:
   410  		return nil, fmt.Errorf("unexpected type for address encoded content: %T", v)
   411  	}
   412  }
   413  
   414  // trimValueToPrefixLength trims |value| by truncating the bytes after |prefixLength|. If |prefixLength|
   415  // is zero or if |value| is nil, then no trimming is done and |value| is directly returned. The
   416  // |encoding| param indicates the original encoding of |value| in the source table.
   417  func trimValueToPrefixLength(value []byte, prefixLength uint16, encoding val.Encoding) []byte {
   418  	if value == nil || prefixLength == 0 {
   419  		return value
   420  	}
   421  
   422  	if uint16(len(value)) < prefixLength {
   423  		prefixLength = uint16(len(value))
   424  	}
   425  
   426  	addTerminatingNullByte := false
   427  	if encoding == val.BytesAddrEnc || encoding == val.StringAddrEnc {
   428  		// If the original encoding was for a BLOB or TEXT field, then we need to add
   429  		// a null byte at the end of the prefix to get it into StringEnc format.
   430  		addTerminatingNullByte = true
   431  	} else if prefixLength < uint16(len(value)) {
   432  		// Otherwise, if we're trimming a StringEnc value, we also need to re-add the
   433  		// null terminating byte.
   434  		addTerminatingNullByte = true
   435  	}
   436  
   437  	newValue := make([]byte, prefixLength)
   438  	copy(newValue, value[:prefixLength])
   439  	if addTerminatingNullByte {
   440  		newValue = append(newValue, byte(0))
   441  	}
   442  
   443  	return newValue
   444  }
   445  
   446  func ordinalMappingsForSecondaryIndex(sch schema.Schema, def schema.Index) (ord val.OrdinalMapping) {
   447  	// assert empty values for secondary indexes
   448  	if def.Schema().GetNonPKCols().Size() > 0 {
   449  		panic("expected empty secondary index values")
   450  	}
   451  
   452  	secondary := def.Schema().GetPKCols()
   453  	ord = make(val.OrdinalMapping, secondary.Size())
   454  
   455  	for i := range ord {
   456  		name := secondary.GetByIndex(i).Name
   457  		ord[i] = -1
   458  
   459  		pks := sch.GetPKCols().GetColumns()
   460  		for j, col := range pks {
   461  			if col.Name == name {
   462  				ord[i] = j
   463  			}
   464  		}
   465  		vals := sch.GetNonPKCols().GetColumns()
   466  		for _, col := range vals {
   467  			if col.Name == name {
   468  				storedIdx, ok := sch.GetNonPKCols().StoredIndexByTag(col.Tag)
   469  				if !ok {
   470  					panic("column " + name + " not found")
   471  				}
   472  				ord[i] = storedIdx + len(pks)
   473  			}
   474  		}
   475  		if ord[i] < 0 {
   476  			panic("column " + name + " not found")
   477  		}
   478  	}
   479  	return
   480  }
   481  
   482  // iterDatabaseTables is a utility to factor out common validation access patterns.
   483  func iterDatabaseTables(
   484  	ctx context.Context,
   485  	db sqle.Database,
   486  	cb func(name string, t *doltdb.Table, sch schema.Schema) (bool, error),
   487  ) error {
   488  	ddb := db.GetDoltDB()
   489  	branches, err := ddb.GetBranches(ctx)
   490  	if err != nil {
   491  		return err
   492  	}
   493  
   494  	for _, branchRef := range branches {
   495  		wsRef, err := ref.WorkingSetRefForHead(branchRef)
   496  		if err != nil {
   497  			return err
   498  		}
   499  		ws, err := ddb.ResolveWorkingSet(ctx, wsRef)
   500  		if err != nil {
   501  			return err
   502  		}
   503  
   504  		r := ws.WorkingRoot()
   505  
   506  		if err = r.IterTables(ctx, cb); err != nil {
   507  			return err
   508  		}
   509  	}
   510  	return nil
   511  }