github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/app_stats.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql

import (
	"bytes"
	"context"
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/parser"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
)

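// stmtKey is the map key for per-statement statistics. It combines the
// anonymized statement fingerprint with the execution characteristics that
// are tracked separately: whether the statement failed, whether it was
// executed via DistSQL, and whether it ran in an implicit transaction.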
type stmtKey struct {
	stmt        string
	failed      bool
	distSQLUsed bool
	implicitTxn bool
}

// appStats holds per-application statistics.
type appStats struct {
	syncutil.Mutex

	st    *cluster.Settings
	stmts map[stmtKey]*stmtStats
	txns  transactionStats
}

// stmtStats holds per-statement statistics.
type stmtStats struct {
	syncutil.Mutex

	data roachpb.StatementStatistics
}

// transactionStats holds per-application transaction statistics.
type transactionStats struct {
	mu struct {
		syncutil.Mutex
		roachpb.TxnStats
	}
}

// stmtStatsEnable determines whether to collect per-statement
// statistics.
var stmtStatsEnable = settings.RegisterPublicBoolSetting(
	"sql.metrics.statement_details.enabled", "collect per-statement query statistics", true,
)

// txnStatsEnable determines whether to collect per-application transaction
// statistics.
var txnStatsEnable = settings.RegisterPublicBoolSetting(
	"sql.metrics.transaction_details.enabled", "collect per-application transaction statistics", true,
)

// sqlStatsCollectionLatencyThreshold specifies the minimum amount of time
// consumed by a SQL statement before it is collected for statistics reporting.
var sqlStatsCollectionLatencyThreshold = settings.RegisterPublicDurationSetting(
	"sql.metrics.statement_details.threshold",
	"minimum execution time to cause statistics to be collected",
	0,
)

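// dumpStmtStatsToLogBeforeReset determines whether the collected statement
// statistics are dumped to the node logs when they are periodically cleared.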
var dumpStmtStatsToLogBeforeReset = settings.RegisterPublicBoolSetting(
	"sql.metrics.statement_details.dump_to_logs",
	"dump collected statement statistics to node logs when periodically cleared",
	false,
)

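// sampleLogicalPlans determines whether a logical plan is periodically saved
// for each statement fingerprint.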
var sampleLogicalPlans = settings.RegisterPublicBoolSetting(
	"sql.metrics.statement_details.plan_collection.enabled",
	"periodically save a logical plan for each fingerprint",
	true,
)

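// logicalPlanCollectionPeriod is the minimum time that must elapse before a
// new logical plan is collected for a given statement fingerprint.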
var logicalPlanCollectionPeriod = settings.RegisterPublicNonNegativeDurationSetting(
	"sql.metrics.statement_details.plan_collection.period",
	"the time until a new logical plan is collected",
	5*time.Minute,
)

func (s stmtKey) String() string {
	return s.flags() + s.stmt
}

func (s stmtKey) flags() string {
	var b bytes.Buffer
	if s.failed {
		b.WriteByte('!')
	}
	if s.distSQLUsed {
		b.WriteByte('+')
	}
	return b.String()
}
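
// exampleStmtKeyString is an illustrative sketch (not part of the original
// file) of how a statement key renders: flags() prefixes '!' for a failed
// execution and '+' for a DistSQL execution ahead of the fingerprint.
func exampleStmtKeyString() string {
	k := stmtKey{stmt: "SELECT _ FROM _", failed: true, distSQLUsed: true}
	return k.String() // "!+SELECT _ FROM _"
}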

// recordStatement saves per-statement statistics.
//
// samplePlanDescription can be nil, as these are only sampled periodically per unique fingerprint.
func (a *appStats) recordStatement(
	stmt *Statement,
	samplePlanDescription *roachpb.ExplainTreePlanNode,
	distSQLUsed bool,
	implicitTxn bool,
	automaticRetryCount int,
	numRows int,
	err error,
	parseLat, planLat, runLat, svcLat, ovhLat float64,
	bytesRead, rowsRead int64,
) {
	if !stmtStatsEnable.Get(&a.st.SV) {
		return
	}

	if t := sqlStatsCollectionLatencyThreshold.Get(&a.st.SV); t > 0 && t.Seconds() >= svcLat {
		return
	}

	// Get the statistics object.
	s := a.getStatsForStmt(stmt, distSQLUsed, implicitTxn, err, true /* createIfNonexistent */)

	// Collect the per-statement statistics.
	s.Lock()
	s.data.Count++
	if err != nil {
		s.data.SensitiveInfo.LastErr = err.Error()
	}
	// Only update MostRecentPlanDescription if we sampled a new PlanDescription.
	if samplePlanDescription != nil {
		s.data.SensitiveInfo.MostRecentPlanDescription = *samplePlanDescription
		s.data.SensitiveInfo.MostRecentPlanTimestamp = timeutil.Now()
	}
	if automaticRetryCount == 0 {
		s.data.FirstAttemptCount++
	} else if int64(automaticRetryCount) > s.data.MaxRetries {
		s.data.MaxRetries = int64(automaticRetryCount)
	}
	s.data.NumRows.Record(s.data.Count, float64(numRows))
	s.data.ParseLat.Record(s.data.Count, parseLat)
	s.data.PlanLat.Record(s.data.Count, planLat)
	s.data.RunLat.Record(s.data.Count, runLat)
	s.data.ServiceLat.Record(s.data.Count, svcLat)
	s.data.OverheadLat.Record(s.data.Count, ovhLat)
	s.data.BytesRead = bytesRead
	s.data.RowsRead = rowsRead
	s.Unlock()
}
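
// exampleLatencyThresholdGate is an illustrative sketch (not part of the
// original file) of the gate at the top of recordStatement: when the
// sql.metrics.statement_details.threshold setting is non-zero, a statement is
// only recorded if its service latency strictly exceeds the threshold.
func exampleLatencyThresholdGate(threshold time.Duration, svcLat float64) bool {
	return threshold == 0 || svcLat > threshold.Seconds()
}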

// getStatsForStmt retrieves the per-stmt stat object.
func (a *appStats) getStatsForStmt(
	stmt *Statement, distSQLUsed bool, implicitTxn bool, err error, createIfNonexistent bool,
) *stmtStats {
	// Extend the statement key with various characteristics, so
	// that we use separate buckets for the different situations.
	key := stmtKey{failed: err != nil, distSQLUsed: distSQLUsed, implicitTxn: implicitTxn}
	if stmt.AnonymizedStr != "" {
		// Use the cached anonymized string.
		key.stmt = stmt.AnonymizedStr
	} else {
		key.stmt = anonymizeStmt(stmt.AST)
	}

	return a.getStatsForStmtWithKey(key, createIfNonexistent)
}

func (a *appStats) getStatsForStmtWithKey(key stmtKey, createIfNonexistent bool) *stmtStats {
	a.Lock()
	// Retrieve the per-statement statistic object, and create it if it
	// doesn't exist yet.
	s, ok := a.stmts[key]
	if !ok && createIfNonexistent {
		s = &stmtStats{}
		a.stmts[key] = s
	}
	a.Unlock()
	return s
}

// Add combines one appStats into another. Add takes its own locks on a, so
// the caller must not hold a's lock when calling it; doing so would deadlock.
func (a *appStats) Add(other *appStats) {
	other.Lock()
	statMap := make(map[stmtKey]*stmtStats)
	for k, v := range other.stmts {
		statMap[k] = v
	}
	other.Unlock()

	// Copy the statement stats for each statement key.
	for k, v := range statMap {
		v.Lock()
		statCopy := &stmtStats{data: v.data}
		v.Unlock()
		statMap[k] = statCopy
	}

	// Merge the statement stats.
	for k, v := range statMap {
		s := a.getStatsForStmtWithKey(k, true)
		s.Lock()
		// Note that we don't need to take a lock on v because
		// no other thread knows about v yet.
		s.data.Add(&v.data)
		s.Unlock()
	}

	// Create a copy of the other's transaction statistics.
	other.txns.mu.Lock()
	txnStats := other.txns.mu.TxnStats
	other.txns.mu.Unlock()

	// Merge the transaction stats.
	a.txns.mu.Lock()
	a.txns.mu.TxnStats.Add(txnStats)
	a.txns.mu.Unlock()
}

func anonymizeStmt(ast tree.Statement) string {
	return tree.AsStringWithFlags(ast, tree.FmtHideConstants)
}
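
// exampleAnonymizeStmt is an illustrative sketch (not part of the original
// file): fingerprinting a statement with FmtHideConstants replaces literals
// with placeholders, so queries that differ only in constants share a key.
func exampleAnonymizeStmt() (string, error) {
	stmt, err := parser.ParseOne("SELECT k, v FROM kv WHERE k = 123")
	if err != nil {
		return "", err
	}
	// The constant 123 is hidden, e.g. "SELECT k, v FROM kv WHERE k = _".
	return anonymizeStmt(stmt.AST), nil
}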

func (s *transactionStats) getStats() (
	txnCount int64,
	txnTimeAvg float64,
	txnTimeVar float64,
	committedCount int64,
	implicitCount int64,
) {
	s.mu.Lock()
	defer s.mu.Unlock()
	txnCount = s.mu.TxnCount
	txnTimeAvg = s.mu.TxnTimeSec.Mean
	txnTimeVar = s.mu.TxnTimeSec.GetVariance(txnCount)
	committedCount = s.mu.CommittedCount
	implicitCount = s.mu.ImplicitCount
	return txnCount, txnTimeAvg, txnTimeVar, committedCount, implicitCount
}

func (s *transactionStats) recordTransaction(txnTimeSec float64, ev txnEvent, implicit bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.mu.TxnCount++
	s.mu.TxnTimeSec.Record(s.mu.TxnCount, txnTimeSec)
	if ev == txnCommit {
		s.mu.CommittedCount++
	}
	if implicit {
		s.mu.ImplicitCount++
	}
}
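
// exampleRecordTransaction is an illustrative sketch (not part of the original
// file): recording a committed implicit transaction that took 25ms and then
// reading back the aggregated count and mean latency.
func exampleRecordTransaction(ts *transactionStats) (count int64, meanSec float64) {
	ts.recordTransaction(0.025 /* txnTimeSec */, txnCommit, true /* implicit */)
	count, meanSec, _, _, _ = ts.getStats()
	return count, meanSec
}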

func (a *appStats) recordTransaction(txnTimeSec float64, ev txnEvent, implicit bool) {
	if !txnStatsEnable.Get(&a.st.SV) {
		return
	}
	a.txns.recordTransaction(txnTimeSec, ev, implicit)
}

// shouldSaveLogicalPlanDescription returns whether we should save this as a
// sample logical plan for its corresponding fingerprint. We use
// `logicalPlanCollectionPeriod` to assess how frequently to sample logical
// plans.
func (a *appStats) shouldSaveLogicalPlanDescription(
	stmt *Statement, useDistSQL bool, implicitTxn bool, err error,
) bool {
	if !sampleLogicalPlans.Get(&a.st.SV) {
		return false
	}
	stats := a.getStatsForStmt(stmt, useDistSQL, implicitTxn, err, false /* createIfNonexistent */)
	if stats == nil {
		// Save the logical plan the first time we see a new statement fingerprint.
		return true
	}
	now := timeutil.Now()
	period := logicalPlanCollectionPeriod.Get(&a.st.SV)
	stats.Lock()
	defer stats.Unlock()
	timeLastSampled := stats.data.SensitiveInfo.MostRecentPlanTimestamp
	return now.Sub(timeLastSampled) >= period
}
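
// examplePlanSamplingDecision is an illustrative sketch (not part of the
// original file) of the rule shouldSaveLogicalPlanDescription applies: a plan
// is sampled when no plan has been collected for the fingerprint yet, or when
// at least logicalPlanCollectionPeriod has elapsed since the last sample.
func examplePlanSamplingDecision(lastSampled time.Time, period time.Duration) bool {
	return lastSampled.IsZero() || timeutil.Now().Sub(lastSampled) >= period
}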

// sqlStats carries per-application statistics for all applications.
type sqlStats struct {
	syncutil.Mutex

	st *cluster.Settings
	// lastReset is the time at which the app containers were reset.
	lastReset time.Time
	// apps is the container for all the per-application statistics objects.
	apps map[string]*appStats
}

func (s *sqlStats) getStatsForApplication(appName string) *appStats {
	s.Lock()
	defer s.Unlock()
	if a, ok := s.apps[appName]; ok {
		return a
	}
	a := &appStats{
		st:    s.st,
		stmts: make(map[stmtKey]*stmtStats),
	}
	s.apps[appName] = a
	return a
}

// resetAndMaybeDumpStats clears all the stored per-app and per-statement
// statistics. If target is not nil, then the stats in s will be flushed
// into target.
func (s *sqlStats) resetAndMaybeDumpStats(ctx context.Context, target *sqlStats) {
	// Note: we do not clear the entire s.apps map here. We would need
	// to do so to prevent problems with a runaway client running `SET
	// APPLICATION_NAME=...` with a different name every time.  However,
	// any ongoing open client session at the time of the reset has
	// cached a pointer to its appStats struct and would thus continue
	// to report its stats in an object now invisible to the target tools
	// (virtual table, marshaling, etc.). It's a judgement call, but
	// for now we prefer to see more data and thus not clear the map, at
	// the risk of seeing the map grow unboundedly with the number of
	// different application_names seen so far.

	// appStatsCopy will hold a snapshot of the stats being cleared
	// to dump into target.
	var appStatsCopy map[string]*appStats

	s.Lock()

	if target != nil {
		appStatsCopy = make(map[string]*appStats, len(s.apps))
	}

	// Clear the per-app maps manually,
	// because any SQL session currently open has cached the
	// pointer to its appStats object and will continue to
	// accumulate data using that until it closes (or changes its
	// application_name).
	for appName, a := range s.apps {
		a.Lock()

		// Save the existing data to logs.
		// TODO(knz/dt): instead of dumping the stats to the log, save
		// them in a SQL table so they can be inspected by the DBA and/or
		// the UI.
		if dumpStmtStatsToLogBeforeReset.Get(&a.st.SV) {
			dumpStmtStats(ctx, appName, a.stmts)
		}

		// Only save a copy of a if we need to dump a copy of the stats.
		if target != nil {
			aCopy := &appStats{st: a.st, stmts: a.stmts}
			appStatsCopy[appName] = aCopy
		}

		// Clear the map to release the memory; size the new map for the
		// likely future workload.
		a.stmts = make(map[stmtKey]*stmtStats, len(a.stmts)/2)
		a.Unlock()
	}
	s.lastReset = timeutil.Now()
	s.Unlock()

	// Dump the copied stats into target.
	if target != nil {
		for k, v := range appStatsCopy {
			stats := target.getStatsForApplication(k)
			// Add manages locks for itself, so we don't need to guard it with locks.
			stats.Add(v)
		}
	}
}

func (s *sqlStats) getLastReset() time.Time {
	s.Lock()
	defer s.Unlock()
	return s.lastReset
}

// Save the existing data for an application to the info log.
func dumpStmtStats(ctx context.Context, appName string, stats map[stmtKey]*stmtStats) {
	if len(stats) == 0 {
		return
	}
	var buf bytes.Buffer
	for key, s := range stats {
		s.Lock()
		json, err := json.Marshal(s.data)
		s.Unlock()
		if err != nil {
			log.Errorf(ctx, "error while marshaling stats for %q // %q: %v", appName, key.String(), err)
			continue
		}
		fmt.Fprintf(&buf, "%q: %s\n", key.String(), json)
	}
	log.Infof(ctx, "Statistics for %q:\n%s", appName, buf.String())
}

func scrubStmtStatKey(vt VirtualTabler, key string) (string, bool) {
	// Re-parse the statement to obtain its AST.
	stmt, err := parser.ParseOne(key)
	if err != nil {
		return "", false
	}

	// Re-format to remove most names.
	f := tree.NewFmtCtx(tree.FmtAnonymize)

	reformatFn := func(ctx *tree.FmtCtx, tn *tree.TableName) {
		virtual, err := vt.getVirtualTableEntry(tn)
		if err != nil || virtual.desc == nil {
			ctx.WriteByte('_')
			return
		}
		// Virtual table: we want to keep the name; however
		// we need to scrub the database name prefix.
		newTn := *tn
		newTn.CatalogName = "_"

		ctx.WithFlags(tree.FmtParsable, func() {
			ctx.WithReformatTableNames(nil, func() {
				ctx.FormatNode(&newTn)
			})
		})
	}
	f.SetReformatTableNames(reformatFn)
	f.FormatNode(stmt.AST)
	return f.CloseAndGetString(), true
}
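
// exampleScrubStmtStatKey is an illustrative sketch (not part of the original
// file): scrubbing re-parses a statement key and anonymizes identifiers, with
// table names collapsed to '_' unless they refer to virtual tables, whose
// names are kept with the database prefix scrubbed.
func exampleScrubStmtStatKey(vt VirtualTabler) (string, bool) {
	// For a non-virtual table this yields something like "SELECT _ FROM _".
	return scrubStmtStatKey(vt, "SELECT secret_col FROM private_table")
}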

func (s *sqlStats) getScrubbedStmtStats(
	vt *VirtualSchemaHolder,
) []roachpb.CollectedStatementStatistics {
	return s.getStmtStats(vt, true /* scrub */)
}

func (s *sqlStats) getUnscrubbedStmtStats(
	vt *VirtualSchemaHolder,
) []roachpb.CollectedStatementStatistics {
	return s.getStmtStats(vt, false /* scrub */)
}

func (s *sqlStats) getStmtStats(
	vt *VirtualSchemaHolder, scrub bool,
) []roachpb.CollectedStatementStatistics {
	s.Lock()
	defer s.Unlock()
	var ret []roachpb.CollectedStatementStatistics
	salt := ClusterSecret.Get(&s.st.SV)
	for appName, a := range s.apps {
		a.Lock()
		if cap(ret) == 0 {
			// guesstimate that we'll need apps*(queries-per-app).
			ret = make([]roachpb.CollectedStatementStatistics, 0, len(a.stmts)*len(s.apps))
		}
		for q, stats := range a.stmts {
			maybeScrubbed := q.stmt
			maybeHashedAppName := appName
			ok := true
			if scrub {
				maybeScrubbed, ok = scrubStmtStatKey(vt, q.stmt)
				if !strings.HasPrefix(appName, sqlbase.ReportableAppNamePrefix) {
					maybeHashedAppName = HashForReporting(salt, appName)
				}
			}
			if ok {
				k := roachpb.StatementStatisticsKey{
					Query:       maybeScrubbed,
					DistSQL:     q.distSQLUsed,
					Opt:         true,
					ImplicitTxn: q.implicitTxn,
					Failed:      q.failed,
					App:         maybeHashedAppName,
				}
				stats.Lock()
				data := stats.data
				stats.Unlock()

				if scrub {
					// Quantize the counts to avoid leaking information through exact values.
					quantizeCounts(&data)
					data.SensitiveInfo = data.SensitiveInfo.GetScrubbedCopy()
				}

				ret = append(ret, roachpb.CollectedStatementStatistics{Key: k, Stats: data})
			}
		}
		a.Unlock()
	}
	return ret
}

// quantizeCounts ensures that the counts are bucketed into "simple" values.
func quantizeCounts(d *roachpb.StatementStatistics) {
	oldCount := d.Count
	newCount := telemetry.Bucket10(oldCount)
	d.Count = newCount
	// The SquaredDiffs values are meant to enable computing the variance
	// via the formula variance = squareddiffs / (count - 1).
	// Since we're adjusting the count, we must re-compute a value
	// for SquaredDiffs that keeps the same variance with the new count.
	oldCountMinusOne := float64(oldCount - 1)
	newCountMinusOne := float64(newCount - 1)
	d.NumRows.SquaredDiffs = (d.NumRows.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.ParseLat.SquaredDiffs = (d.ParseLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.PlanLat.SquaredDiffs = (d.PlanLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.RunLat.SquaredDiffs = (d.RunLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.ServiceLat.SquaredDiffs = (d.ServiceLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.OverheadLat.SquaredDiffs = (d.OverheadLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne

	d.MaxRetries = telemetry.Bucket10(d.MaxRetries)

	d.FirstAttemptCount = int64((float64(d.FirstAttemptCount) / float64(oldCount)) * float64(newCount))
}
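
// exampleQuantizePreservesVariance is an illustrative sketch (not part of the
// original file) of the invariant quantizeCounts maintains: since variance is
// derived as squaredDiffs / (count - 1), rescaling SquaredDiffs by
// (newCount-1)/(oldCount-1) keeps the derived variance unchanged under the
// bucketed count. It assumes counts greater than one, as quantizeCounts
// itself implicitly does.
func exampleQuantizePreservesVariance(squaredDiffs float64, oldCount, newCount int64) (before, after float64) {
	before = squaredDiffs / float64(oldCount-1)
	rescaled := (squaredDiffs / float64(oldCount-1)) * float64(newCount-1)
	after = rescaled / float64(newCount-1)
	return before, after // Equal, up to floating-point rounding.
}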

// FailedHashedValue is used as a default return value for when HashForReporting
// cannot hash a value correctly.
const FailedHashedValue = "unknown"

// HashForReporting 1-way hashes values for use in stat reporting. The secret
// should be the cluster.secret setting.
func HashForReporting(secret, appName string) string {
	// If no secret is provided, we cannot irreversibly hash the value, so return
	// a default value.
	if len(secret) == 0 {
		return FailedHashedValue
	}
	hash := hmac.New(sha256.New, []byte(secret))
	if _, err := hash.Write([]byte(appName)); err != nil {
		panic(errors.NewAssertionErrorWithWrappedErrf(err,
			`"It never returns an error." -- https://golang.org/pkg/hash`))
	}
	return hex.EncodeToString(hash.Sum(nil)[:4])
}
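
// exampleHashForReporting is an illustrative sketch (not part of the original
// file): application names are reported as the hex encoding of the first 4
// bytes of an HMAC-SHA256 digest keyed with the cluster secret, and fall back
// to the FailedHashedValue sentinel when no secret is configured.
func exampleHashForReporting(clusterSecret string) (hashed, fallback string) {
	hashed = HashForReporting(clusterSecret, "myapp") // 8 hex characters.
	fallback = HashForReporting("", "myapp")          // "unknown"
	return hashed, fallback
}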