github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/app_stats.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql

import (
	"bytes"
	"context"
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/parser"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
)

// stmtKey is the key used in appStats to map a statement fingerprint and its
// execution characteristics to the corresponding stats object.
type stmtKey struct {
	stmt        string
	failed      bool
	distSQLUsed bool
	implicitTxn bool
}

// appStats holds per-application statistics.
type appStats struct {
	syncutil.Mutex

	st    *cluster.Settings
	stmts map[stmtKey]*stmtStats
	txns  transactionStats
}

// stmtStats holds per-statement statistics.
type stmtStats struct {
	syncutil.Mutex

	data roachpb.StatementStatistics
}

// transactionStats holds per-application transaction statistics.
type transactionStats struct {
	mu struct {
		syncutil.Mutex
		roachpb.TxnStats
	}
}

// stmtStatsEnable determines whether to collect per-statement
// statistics.
var stmtStatsEnable = settings.RegisterPublicBoolSetting(
	"sql.metrics.statement_details.enabled", "collect per-statement query statistics", true,
)

// txnStatsEnable determines whether to collect per-application transaction
// statistics.
var txnStatsEnable = settings.RegisterPublicBoolSetting(
	"sql.metrics.transaction_details.enabled", "collect per-application transaction statistics", true,
)

// sqlStatsCollectionLatencyThreshold specifies the minimum amount of time
// consumed by a SQL statement before it is collected for statistics reporting.
var sqlStatsCollectionLatencyThreshold = settings.RegisterPublicDurationSetting(
	"sql.metrics.statement_details.threshold",
	"minimum execution time to cause statistics to be collected",
	0,
)

var dumpStmtStatsToLogBeforeReset = settings.RegisterPublicBoolSetting(
	"sql.metrics.statement_details.dump_to_logs",
	"dump collected statement statistics to node logs when periodically cleared",
	false,
)

var sampleLogicalPlans = settings.RegisterPublicBoolSetting(
	"sql.metrics.statement_details.plan_collection.enabled",
	"periodically save a logical plan for each fingerprint",
	true,
)

var logicalPlanCollectionPeriod = settings.RegisterPublicNonNegativeDurationSetting(
	"sql.metrics.statement_details.plan_collection.period",
	"the time until a new logical plan is collected",
	5*time.Minute,
)

func (s stmtKey) String() string {
	return s.flags() + s.stmt
}

func (s stmtKey) flags() string {
	var b bytes.Buffer
	if s.failed {
		b.WriteByte('!')
	}
	if s.distSQLUsed {
		b.WriteByte('+')
	}
	return b.String()
}

// recordStatement saves per-statement statistics.
//
// samplePlanDescription can be nil, as these are only sampled periodically
// per unique fingerprint.
func (a *appStats) recordStatement(
	stmt *Statement,
	samplePlanDescription *roachpb.ExplainTreePlanNode,
	distSQLUsed bool,
	implicitTxn bool,
	automaticRetryCount int,
	numRows int,
	err error,
	parseLat, planLat, runLat, svcLat, ovhLat float64,
	bytesRead, rowsRead int64,
) {
	if !stmtStatsEnable.Get(&a.st.SV) {
		return
	}

	if t := sqlStatsCollectionLatencyThreshold.Get(&a.st.SV); t > 0 && t.Seconds() >= svcLat {
		return
	}

	// Get the statistics object.
	s := a.getStatsForStmt(stmt, distSQLUsed, implicitTxn, err, true /* createIfNonexistent */)

	// Collect the per-statement statistics.
	s.Lock()
	s.data.Count++
	if err != nil {
		s.data.SensitiveInfo.LastErr = err.Error()
	}
	// Only update MostRecentPlanDescription if we sampled a new PlanDescription.
	if samplePlanDescription != nil {
		s.data.SensitiveInfo.MostRecentPlanDescription = *samplePlanDescription
		s.data.SensitiveInfo.MostRecentPlanTimestamp = timeutil.Now()
	}
	if automaticRetryCount == 0 {
		s.data.FirstAttemptCount++
	} else if int64(automaticRetryCount) > s.data.MaxRetries {
		s.data.MaxRetries = int64(automaticRetryCount)
	}
	s.data.NumRows.Record(s.data.Count, float64(numRows))
	s.data.ParseLat.Record(s.data.Count, parseLat)
	s.data.PlanLat.Record(s.data.Count, planLat)
	s.data.RunLat.Record(s.data.Count, runLat)
	s.data.ServiceLat.Record(s.data.Count, svcLat)
	s.data.OverheadLat.Record(s.data.Count, ovhLat)
	s.data.BytesRead = bytesRead
	s.data.RowsRead = rowsRead
	s.Unlock()
}

// getStatsForStmt retrieves the per-stmt stat object.
func (a *appStats) getStatsForStmt(
	stmt *Statement, distSQLUsed bool, implicitTxn bool, err error, createIfNonexistent bool,
) *stmtStats {
	// Extend the statement key with various characteristics, so
	// that we use separate buckets for the different situations.
	key := stmtKey{failed: err != nil, distSQLUsed: distSQLUsed, implicitTxn: implicitTxn}
	if stmt.AnonymizedStr != "" {
		// Use the cached anonymized string.
		key.stmt = stmt.AnonymizedStr
	} else {
		key.stmt = anonymizeStmt(stmt.AST)
	}

	return a.getStatsForStmtWithKey(key, createIfNonexistent)
}

func (a *appStats) getStatsForStmtWithKey(key stmtKey, createIfNonexistent bool) *stmtStats {
	a.Lock()
	// Retrieve the per-statement statistic object, and create it if it
	// doesn't exist yet.
	s, ok := a.stmts[key]
	if !ok && createIfNonexistent {
		s = &stmtStats{}
		a.stmts[key] = s
	}
	a.Unlock()
	return s
}

// Add combines one appStats into another. Add takes a's locks itself, so
// callers must not hold a's lock when calling Add, or it will deadlock.
func (a *appStats) Add(other *appStats) {
	other.Lock()
	statMap := make(map[stmtKey]*stmtStats)
	for k, v := range other.stmts {
		statMap[k] = v
	}
	other.Unlock()

	// Copy the statement stats for each statement key.
	for k, v := range statMap {
		v.Lock()
		statCopy := &stmtStats{data: v.data}
		v.Unlock()
		statMap[k] = statCopy
	}

	// Merge the statement stats.
	for k, v := range statMap {
		s := a.getStatsForStmtWithKey(k, true /* createIfNonexistent */)
		s.Lock()
		// Note that we don't need to take a lock on v because
		// no other thread knows about v yet.
		s.data.Add(&v.data)
		s.Unlock()
	}

	// Create a copy of the other's transaction statistics.
	other.txns.mu.Lock()
	txnStats := other.txns.mu.TxnStats
	other.txns.mu.Unlock()

	// Merge the transaction stats.
	a.txns.mu.Lock()
	a.txns.mu.TxnStats.Add(txnStats)
	a.txns.mu.Unlock()
}

func anonymizeStmt(ast tree.Statement) string {
	return tree.AsStringWithFlags(ast, tree.FmtHideConstants)
}

func (s *transactionStats) getStats() (
	txnCount int64,
	txnTimeAvg float64,
	txnTimeVar float64,
	committedCount int64,
	implicitCount int64,
) {
	s.mu.Lock()
	defer s.mu.Unlock()
	txnCount = s.mu.TxnCount
	txnTimeAvg = s.mu.TxnTimeSec.Mean
	txnTimeVar = s.mu.TxnTimeSec.GetVariance(txnCount)
	committedCount = s.mu.CommittedCount
	implicitCount = s.mu.ImplicitCount
	return txnCount, txnTimeAvg, txnTimeVar, committedCount, implicitCount
}

func (s *transactionStats) recordTransaction(txnTimeSec float64, ev txnEvent, implicit bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.mu.TxnCount++
	s.mu.TxnTimeSec.Record(s.mu.TxnCount, txnTimeSec)
	if ev == txnCommit {
		s.mu.CommittedCount++
	}
	if implicit {
		s.mu.ImplicitCount++
	}
}

func (a *appStats) recordTransaction(txnTimeSec float64, ev txnEvent, implicit bool) {
	if !txnStatsEnable.Get(&a.st.SV) {
		return
	}
	a.txns.recordTransaction(txnTimeSec, ev, implicit)
}

// shouldSaveLogicalPlanDescription returns whether we should save this as a
// sample logical plan for its corresponding fingerprint. We use
// `logicalPlanCollectionPeriod` to assess how frequently to sample logical
// plans.
func (a *appStats) shouldSaveLogicalPlanDescription(
	stmt *Statement, useDistSQL bool, implicitTxn bool, err error,
) bool {
	if !sampleLogicalPlans.Get(&a.st.SV) {
		return false
	}
	stats := a.getStatsForStmt(stmt, useDistSQL, implicitTxn, err, false /* createIfNonexistent */)
	if stats == nil {
		// Save the logical plan the first time we see a new statement fingerprint.
		return true
	}
	now := timeutil.Now()
	period := logicalPlanCollectionPeriod.Get(&a.st.SV)
	stats.Lock()
	defer stats.Unlock()
	timeLastSampled := stats.data.SensitiveInfo.MostRecentPlanTimestamp
	return now.Sub(timeLastSampled) >= period
}

// sqlStats carries per-application statistics for all applications.
type sqlStats struct {
	syncutil.Mutex

	st *cluster.Settings
	// lastReset is the time at which the app containers were reset.
	lastReset time.Time
	// apps is the container for all the per-application statistics objects.
	apps map[string]*appStats
}

func (s *sqlStats) getStatsForApplication(appName string) *appStats {
	s.Lock()
	defer s.Unlock()
	if a, ok := s.apps[appName]; ok {
		return a
	}
	a := &appStats{
		st:    s.st,
		stmts: make(map[stmtKey]*stmtStats),
	}
	s.apps[appName] = a
	return a
}

// resetAndMaybeDumpStats clears all the stored per-app and per-statement
// statistics. If target is not nil, then the stats in s will be flushed
// into target.
func (s *sqlStats) resetAndMaybeDumpStats(ctx context.Context, target *sqlStats) {
	// Note: we do not clear the entire s.apps map here. We would need
	// to do so to prevent problems with a runaway client running `SET
	// APPLICATION_NAME=...` with a different name every time. However,
	// any ongoing open client session at the time of the reset has
	// cached a pointer to its appStats struct and would thus continue
	// to report its stats in an object now invisible to the target tools
	// (virtual table, marshaling, etc.). It's a judgement call, but
	// for now we prefer to see more data and thus not clear the map, at
	// the risk of seeing the map grow unboundedly with the number of
	// different application_names seen so far.

	// appStatsCopy will hold a snapshot of the stats being cleared
	// to dump into target.
	var appStatsCopy map[string]*appStats

	s.Lock()

	if target != nil {
		appStatsCopy = make(map[string]*appStats, len(s.apps))
	}

	// Clear the per-app maps manually,
	// because any SQL session currently open has cached the
	// pointer to its appStats object and will continue to
	// accumulate data using that until it closes (or changes its
	// application_name).
	for appName, a := range s.apps {
		a.Lock()

		// Save the existing data to logs.
		// TODO(knz/dt): instead of dumping the stats to the log, save
		// them in a SQL table so they can be inspected by the DBA and/or
		// the UI.
		if dumpStmtStatsToLogBeforeReset.Get(&a.st.SV) {
			dumpStmtStats(ctx, appName, a.stmts)
		}

		// Only save a copy of a if we need to dump a copy of the stats.
		if target != nil {
			aCopy := &appStats{st: a.st, stmts: a.stmts}
			appStatsCopy[appName] = aCopy
		}

		// Clear the map, to release the memory; make the new map somewhat already
		// large for the likely future workload.
		a.stmts = make(map[stmtKey]*stmtStats, len(a.stmts)/2)
		a.Unlock()
	}
	s.lastReset = timeutil.Now()
	s.Unlock()

	// Dump the copied stats into target.
	if target != nil {
		for k, v := range appStatsCopy {
			stats := target.getStatsForApplication(k)
			// Add manages locks for itself, so we don't need to guard it with locks.
			stats.Add(v)
		}
	}
}

func (s *sqlStats) getLastReset() time.Time {
	s.Lock()
	defer s.Unlock()
	return s.lastReset
}

// dumpStmtStats saves the existing data for an application to the info log.
func dumpStmtStats(ctx context.Context, appName string, stats map[stmtKey]*stmtStats) {
	if len(stats) == 0 {
		return
	}
	var buf bytes.Buffer
	for key, s := range stats {
		s.Lock()
		json, err := json.Marshal(s.data)
		s.Unlock()
		if err != nil {
			log.Errorf(ctx, "error while marshaling stats for %q // %q: %v", appName, key.String(), err)
			continue
		}
		fmt.Fprintf(&buf, "%q: %s\n", key.String(), json)
	}
	log.Infof(ctx, "Statistics for %q:\n%s", appName, buf.String())
}

func scrubStmtStatKey(vt VirtualTabler, key string) (string, bool) {
	// Re-parse the statement to obtain its AST.
	stmt, err := parser.ParseOne(key)
	if err != nil {
		return "", false
	}

	// Re-format to remove most names.
	f := tree.NewFmtCtx(tree.FmtAnonymize)

	reformatFn := func(ctx *tree.FmtCtx, tn *tree.TableName) {
		virtual, err := vt.getVirtualTableEntry(tn)
		if err != nil || virtual.desc == nil {
			ctx.WriteByte('_')
			return
		}
		// Virtual table: we want to keep the name; however
		// we need to scrub the database name prefix.
		newTn := *tn
		newTn.CatalogName = "_"

		ctx.WithFlags(tree.FmtParsable, func() {
			ctx.WithReformatTableNames(nil, func() {
				ctx.FormatNode(&newTn)
			})
		})
	}
	f.SetReformatTableNames(reformatFn)
	f.FormatNode(stmt.AST)
	return f.CloseAndGetString(), true
}

func (s *sqlStats) getScrubbedStmtStats(
	vt *VirtualSchemaHolder,
) []roachpb.CollectedStatementStatistics {
	return s.getStmtStats(vt, true /* scrub */)
}

func (s *sqlStats) getUnscrubbedStmtStats(
	vt *VirtualSchemaHolder,
) []roachpb.CollectedStatementStatistics {
	return s.getStmtStats(vt, false /* scrub */)
}

func (s *sqlStats) getStmtStats(
	vt *VirtualSchemaHolder, scrub bool,
) []roachpb.CollectedStatementStatistics {
	s.Lock()
	defer s.Unlock()
	var ret []roachpb.CollectedStatementStatistics
	salt := ClusterSecret.Get(&s.st.SV)
	for appName, a := range s.apps {
		a.Lock()
		if cap(ret) == 0 {
			// Guesstimate that we'll need apps*(queries-per-app).
			ret = make([]roachpb.CollectedStatementStatistics, 0, len(a.stmts)*len(s.apps))
		}
		for q, stats := range a.stmts {
			maybeScrubbed := q.stmt
			maybeHashedAppName := appName
			ok := true
			if scrub {
				maybeScrubbed, ok = scrubStmtStatKey(vt, q.stmt)
				if !strings.HasPrefix(appName, sqlbase.ReportableAppNamePrefix) {
					maybeHashedAppName = HashForReporting(salt, appName)
				}
			}
			if ok {
				k := roachpb.StatementStatisticsKey{
					Query:       maybeScrubbed,
					DistSQL:     q.distSQLUsed,
					Opt:         true,
					ImplicitTxn: q.implicitTxn,
					Failed:      q.failed,
					App:         maybeHashedAppName,
				}
				stats.Lock()
				data := stats.data
				stats.Unlock()

				if scrub {
					// Quantize the counts to avoid leaking information that way.
					quantizeCounts(&data)
					data.SensitiveInfo = data.SensitiveInfo.GetScrubbedCopy()
				}

				ret = append(ret, roachpb.CollectedStatementStatistics{Key: k, Stats: data})
			}
		}
		a.Unlock()
	}
	return ret
}
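// What follows is an illustrative sketch and not part of the original file:
// it restates the app-name handling inside getStmtStats above as a standalone
// helper, to show the reporting rule in isolation. Application names that do
// not carry sqlbase.ReportableAppNamePrefix are treated as customer-supplied
// and are one-way hashed with the cluster secret before being reported, while
// prefixed (internal) names pass through unchanged. The function name
// exampleReportableAppName is invented for this sketch.
func exampleReportableAppName(salt, appName string) string {
	if strings.HasPrefix(appName, sqlbase.ReportableAppNamePrefix) {
		// Internal application names are reportable as-is.
		return appName
	}
	// Everything else is scrubbed via the one-way HMAC hash defined below.
	return HashForReporting(salt, appName)
}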
// quantizeCounts ensures that the counts are bucketed into "simple" values.
func quantizeCounts(d *roachpb.StatementStatistics) {
	oldCount := d.Count
	newCount := telemetry.Bucket10(oldCount)
	d.Count = newCount
	// The SquaredDiffs values are meant to enable computing the variance
	// via the formula variance = squareddiffs / (count - 1).
	// Since we're adjusting the count, we must re-compute a value
	// for SquaredDiffs that keeps the same variance with the new count.
	oldCountMinusOne := float64(oldCount - 1)
	newCountMinusOne := float64(newCount - 1)
	d.NumRows.SquaredDiffs = (d.NumRows.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.ParseLat.SquaredDiffs = (d.ParseLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.PlanLat.SquaredDiffs = (d.PlanLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.RunLat.SquaredDiffs = (d.RunLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.ServiceLat.SquaredDiffs = (d.ServiceLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne
	d.OverheadLat.SquaredDiffs = (d.OverheadLat.SquaredDiffs / oldCountMinusOne) * newCountMinusOne

	d.MaxRetries = telemetry.Bucket10(d.MaxRetries)

	d.FirstAttemptCount = int64((float64(d.FirstAttemptCount) / float64(oldCount)) * float64(newCount))
}

// FailedHashedValue is used as a default return value for when HashForReporting
// cannot hash a value correctly.
const FailedHashedValue = "unknown"

// HashForReporting 1-way hashes values for use in stat reporting. The secret
// should be the cluster.secret setting.
func HashForReporting(secret, appName string) string {
	// If no secret is provided, we cannot irreversibly hash the value, so return
	// a default value.
	if len(secret) == 0 {
		return FailedHashedValue
	}
	hash := hmac.New(sha256.New, []byte(secret))
	if _, err := hash.Write([]byte(appName)); err != nil {
		panic(errors.NewAssertionErrorWithWrappedErrf(err,
			`"It never returns an error." -- https://golang.org/pkg/hash`))
	}
	return hex.EncodeToString(hash.Sum(nil)[:4])
}
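// Another illustrative sketch, not part of the original file: the
// variance-preserving rescale performed by quantizeCounts, restated for a
// single SquaredDiffs value. Since the variance is computed as
// SquaredDiffs / (Count - 1), replacing oldCount with newCount while keeping
// the variance unchanged requires
//
//	SquaredDiffs' = SquaredDiffs / (oldCount - 1) * (newCount - 1)
//
// The function name exampleRescaleSquaredDiffs is invented for this sketch;
// the guard for counts <= 1 is added here only to keep the sketch total,
// since the variance is undefined for a single sample.
func exampleRescaleSquaredDiffs(squaredDiffs float64, oldCount, newCount int64) float64 {
	if oldCount <= 1 || newCount <= 1 {
		return 0
	}
	return squaredDiffs / float64(oldCount-1) * float64(newCount-1)
}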