github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/show_fingerprints.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package sql
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"strings"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/security"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/catalog/resolver"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/privilege"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    24  	"github.com/cockroachdb/errors"
    25  )
    26  
    27  type showFingerprintsNode struct {
    28  	optColumnsSlot
    29  
    30  	tableDesc *sqlbase.ImmutableTableDescriptor
    31  	indexes   []*sqlbase.IndexDescriptor
    32  
    33  	run showFingerprintsRun
    34  }
    35  
    36  // ShowFingerprints statement fingerprints the data in each index of a table.
    37  // For each index, a full index scan is run to hash every row with the fnv64
    38  // hash. For the primary index, all table columns are included in the hash,
    39  // whereas for secondary indexes, the index cols + the primary index cols + the
    40  // STORING cols are included. The hashed rows are all combined with XOR using
    41  // distsql.
    42  //
    43  // Our hash functions expect input of type BYTES (or string but we use bytes
    44  // here), so we have to convert any datums that are not BYTES. This is currently
    45  // done by round tripping through the string representation of the column
    46  // (`::string::bytes`) and is an obvious area for improvement in the next
    47  // version.
    48  //
    49  // To extract the fingerprints at some point in the past, the following
    50  // query can be used:
    51  //    SELECT * FROM [SHOW EXPERIMENTAL_FINGERPRINTS FROM TABLE foo] AS OF SYSTEM TIME xxx
    52  func (p *planner) ShowFingerprints(
    53  	ctx context.Context, n *tree.ShowFingerprints,
    54  ) (planNode, error) {
    55  	// We avoid the cache so that we can observe the fingerprints without
    56  	// taking a lease, like other SHOW commands.
    57  	tableDesc, err := p.ResolveUncachedTableDescriptorEx(
    58  		ctx, n.Table, true /*required*/, resolver.ResolveRequireTableDesc)
    59  	if err != nil {
    60  		return nil, err
    61  	}
    62  
    63  	if err := p.CheckPrivilege(ctx, tableDesc, privilege.SELECT); err != nil {
    64  		return nil, err
    65  	}
    66  
    67  	return &showFingerprintsNode{
    68  		tableDesc: tableDesc,
    69  		indexes:   tableDesc.AllNonDropIndexes(),
    70  	}, nil
    71  }
    72  
    73  // showFingerprintsRun contains the run-time state of
    74  // showFingerprintsNode during local execution.
    75  type showFingerprintsRun struct {
    76  	rowIdx int
    77  	// values stores the current row, updated by Next().
    78  	values []tree.Datum
    79  }
    80  
    81  func (n *showFingerprintsNode) startExec(params runParams) error {
    82  	n.run.values = []tree.Datum{tree.DNull, tree.DNull}
    83  	return nil
    84  }
    85  
    86  func (n *showFingerprintsNode) Next(params runParams) (bool, error) {
    87  	if n.run.rowIdx >= len(n.indexes) {
    88  		return false, nil
    89  	}
    90  	index := n.indexes[n.run.rowIdx]
    91  
    92  	cols := make([]string, 0, len(n.tableDesc.Columns))
    93  	addColumn := func(col *sqlbase.ColumnDescriptor) {
    94  		// TODO(dan): This is known to be a flawed way to fingerprint. Any datum
    95  		// with the same string representation is fingerprinted the same, even
    96  		// if they're different types.
    97  		switch col.Type.Family() {
    98  		case types.BytesFamily:
    99  			cols = append(cols, fmt.Sprintf("%s:::bytes", tree.NameStringP(&col.Name)))
   100  		default:
   101  			cols = append(cols, fmt.Sprintf("%s::string::bytes", tree.NameStringP(&col.Name)))
   102  		}
   103  	}
   104  
   105  	if index.ID == n.tableDesc.PrimaryIndex.ID {
   106  		for i := range n.tableDesc.Columns {
   107  			addColumn(&n.tableDesc.Columns[i])
   108  		}
   109  	} else {
   110  		colsByID := make(map[sqlbase.ColumnID]*sqlbase.ColumnDescriptor)
   111  		for i := range n.tableDesc.Columns {
   112  			col := &n.tableDesc.Columns[i]
   113  			colsByID[col.ID] = col
   114  		}
   115  		colIDs := append(append(index.ColumnIDs, index.ExtraColumnIDs...), index.StoreColumnIDs...)
   116  		for _, colID := range colIDs {
   117  			addColumn(colsByID[colID])
   118  		}
   119  	}
   120  
   121  	// The fnv64 hash was chosen mostly due to speed. I did an AS OF SYSTEM TIME
   122  	// fingerprint over 31GiB on a 4 node production cluster (with no other
   123  	// traffic to try and keep things comparable). The cluster was restarted in
   124  	// between each run. Resulting times:
   125  	//
   126  	//  fnv => 17m
   127  	//  sha512 => 1h6m
   128  	//  sha265 => 1h6m
   129  	//  fnv64 (again) => 17m
   130  	//
   131  	// TODO(dan): If/when this ever loses its EXPERIMENTAL prefix and gets
   132  	// exposed to users, consider adding a version to the fingerprint output.
   133  	sql := fmt.Sprintf(`SELECT
   134  	  xor_agg(fnv64(%s))::string AS fingerprint
   135  	  FROM [%d AS t]@{FORCE_INDEX=[%d]}
   136  	`, strings.Join(cols, `,`), n.tableDesc.ID, index.ID)
   137  	// If were'in in an AOST context, propagate it to the inner statement so that
   138  	// the inner statement gets planned with planner.avoidCachedDescriptors set,
   139  	// like the outter one.
   140  	if params.p.semaCtx.AsOfTimestamp != nil {
   141  		ts := params.p.txn.ReadTimestamp()
   142  		sql = sql + " AS OF SYSTEM TIME " + ts.AsOfSystemTime()
   143  	}
   144  
   145  	fingerprintCols, err := params.extendedEvalCtx.ExecCfg.InternalExecutor.QueryRowEx(
   146  		params.ctx, "hash-fingerprint",
   147  		params.p.txn,
   148  		sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
   149  		sql,
   150  	)
   151  	if err != nil {
   152  		return false, err
   153  	}
   154  
   155  	if len(fingerprintCols) != 1 {
   156  		return false, errors.AssertionFailedf(
   157  			"unexpected number of columns returned: 1 vs %d",
   158  			len(fingerprintCols))
   159  	}
   160  	fingerprint := fingerprintCols[0]
   161  
   162  	n.run.values[0] = tree.NewDString(index.Name)
   163  	n.run.values[1] = fingerprint
   164  	n.run.rowIdx++
   165  	return true, nil
   166  }
   167  
   168  func (n *showFingerprintsNode) Values() tree.Datums     { return n.run.values }
   169  func (n *showFingerprintsNode) Close(_ context.Context) {}