github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/stmtdiagnostics/statement_diagnostics.go (about)

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package stmtdiagnostics
    12  
    13  import (
    14  	"context"
    15  	"encoding/binary"
    16  	"time"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/gossip"
    19  	"github.com/cockroachdb/cockroach/pkg/kv"
    20  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    21  	"github.com/cockroachdb/cockroach/pkg/security"
    22  	"github.com/cockroachdb/cockroach/pkg/settings"
    23  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/sqlutil"
    27  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    28  	"github.com/cockroachdb/cockroach/pkg/util"
    29  	"github.com/cockroachdb/cockroach/pkg/util/log"
    30  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    31  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    32  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    33  	"github.com/cockroachdb/errors"
    34  )
    35  
    36  var stmtDiagnosticsPollingInterval = settings.RegisterDurationSetting(
    37  	"sql.stmt_diagnostics.poll_interval",
    38  	"rate at which the stmtdiagnostics.Registry polls for requests, set to zero to disable",
    39  	10*time.Second)
    40  
    41  // Registry maintains a view on the statement fingerprints
    42  // on which data is to be collected (i.e. system.statement_diagnostics_requests)
    43  // and provides utilities for checking a query against this list and satisfying
    44  // the requests.
    45  type Registry struct {
    46  	mu struct {
    47  		// NOTE: This lock can't be held while the registry runs any statements
    48  		// internally; it'd deadlock.
    49  		syncutil.Mutex
    50  		// requests waiting for the right query to come along.
    51  		requestFingerprints map[requestID]string
    52  		// ids of requests that this node is in the process of servicing.
    53  		ongoing map[requestID]struct{}
    54  
    55  		// epoch is observed before reading system.statement_diagnostics_requests, and then
    56  		// checked again before loading the tables contents. If the value changed in
    57  		// between, then the table contents might be stale.
    58  		epoch int
    59  	}
    60  	st     *cluster.Settings
    61  	ie     sqlutil.InternalExecutor
    62  	db     *kv.DB
    63  	gossip gossip.DeprecatedGossip
    64  
    65  	// gossipUpdateChan is used to notify the polling loop that a diagnostics
    66  	// request has been added. The gossip callback will not block sending on this
    67  	// channel.
    68  	gossipUpdateChan chan requestID
    69  }
    70  
    71  // NewRegistry constructs a new Registry.
    72  func NewRegistry(
    73  	ie sqlutil.InternalExecutor, db *kv.DB, gw gossip.DeprecatedGossip, st *cluster.Settings,
    74  ) *Registry {
    75  	r := &Registry{
    76  		ie:               ie,
    77  		db:               db,
    78  		gossip:           gw,
    79  		gossipUpdateChan: make(chan requestID, 1),
    80  		st:               st,
    81  	}
    82  	// Some tests pass a nil gossip, and gossip is not available on SQL tenant
    83  	// servers.
    84  	g, ok := gw.Optional(47893)
    85  	if ok && g != nil {
    86  		g.RegisterCallback(gossip.KeyGossipStatementDiagnosticsRequest, r.gossipNotification)
    87  	}
    88  	return r
    89  }
    90  
    91  // Start will start the polling loop for the Registry.
    92  func (r *Registry) Start(ctx context.Context, stopper *stop.Stopper) {
    93  	ctx, _ = stopper.WithCancelOnQuiesce(ctx)
    94  	// NB: The only error that should occur here would be if the server were
    95  	// shutting down so let's swallow it.
    96  	_ = stopper.RunAsyncTask(ctx, "stmt-diag-poll", r.poll)
    97  }
    98  
    99  func (r *Registry) poll(ctx context.Context) {
   100  	var (
   101  		timer               timeutil.Timer
   102  		lastPoll            time.Time
   103  		deadline            time.Time
   104  		pollIntervalChanged = make(chan struct{}, 1)
   105  		maybeResetTimer     = func() {
   106  			if interval := stmtDiagnosticsPollingInterval.Get(&r.st.SV); interval <= 0 {
   107  				// Setting the interval to a non-positive value stops the polling.
   108  				timer.Stop()
   109  			} else {
   110  				newDeadline := lastPoll.Add(interval)
   111  				if deadline.IsZero() || !deadline.Equal(newDeadline) {
   112  					deadline = newDeadline
   113  					timer.Reset(timeutil.Until(deadline))
   114  				}
   115  			}
   116  		}
   117  		poll = func() {
   118  			if err := r.pollRequests(ctx); err != nil {
   119  				if ctx.Err() != nil {
   120  					return
   121  				}
   122  				log.Warningf(ctx, "error polling for statement diagnostics requests: %s", err)
   123  			}
   124  			lastPoll = timeutil.Now()
   125  		}
   126  	)
   127  	stmtDiagnosticsPollingInterval.SetOnChange(&r.st.SV, func() {
   128  		select {
   129  		case pollIntervalChanged <- struct{}{}:
   130  		default:
   131  		}
   132  	})
   133  	for {
   134  		maybeResetTimer()
   135  		select {
   136  		case <-pollIntervalChanged:
   137  			continue // go back around and maybe reset the timer
   138  		case reqID := <-r.gossipUpdateChan:
   139  			if r.findRequest(reqID) {
   140  				continue // request already exists, don't do anything
   141  			}
   142  			// Poll the data.
   143  		case <-timer.C:
   144  			timer.Read = true
   145  		case <-ctx.Done():
   146  			return
   147  		}
   148  		poll()
   149  	}
   150  }
   151  
   152  // requestID is the ID of a diagnostics request, corresponding to the id
   153  // column in statement_diagnostics_requests.
   154  // A zero ID is invalid.
   155  type requestID int
   156  
   157  // ID is the ID of an instance of collected diagnostics, corresponding
   158  // to the id column in statement_diagnostics.
   159  type stmtID int
   160  
   161  // addRequestInternalLocked adds a request to r.mu.requests. If the request is
   162  // already present, the call is a noop.
   163  func (r *Registry) addRequestInternalLocked(
   164  	ctx context.Context, id requestID, queryFingerprint string,
   165  ) {
   166  	if r.findRequestLocked(id) {
   167  		// Request already exists.
   168  		return
   169  	}
   170  	if r.mu.requestFingerprints == nil {
   171  		r.mu.requestFingerprints = make(map[requestID]string)
   172  	}
   173  	r.mu.requestFingerprints[id] = queryFingerprint
   174  }
   175  
   176  func (r *Registry) findRequest(requestID requestID) bool {
   177  	r.mu.Lock()
   178  	defer r.mu.Unlock()
   179  	return r.findRequestLocked(requestID)
   180  }
   181  
   182  func (r *Registry) findRequestLocked(requestID requestID) bool {
   183  	_, ok := r.mu.requestFingerprints[requestID]
   184  	if ok {
   185  		return true
   186  	}
   187  	_, ok = r.mu.ongoing[requestID]
   188  	return ok
   189  }
   190  
   191  // InsertRequest is part of the StmtDiagnosticsRequester interface.
   192  func (r *Registry) InsertRequest(ctx context.Context, fprint string) error {
   193  	_, err := r.insertRequestInternal(ctx, fprint)
   194  	return err
   195  }
   196  
   197  func (r *Registry) insertRequestInternal(ctx context.Context, fprint string) (requestID, error) {
   198  	g, err := r.gossip.OptionalErr(48274)
   199  	if err != nil {
   200  		return 0, err
   201  	}
   202  
   203  	var reqID requestID
   204  	err = r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   205  		// Check if there's already a pending request for this fingerprint.
   206  		row, err := r.ie.QueryRowEx(ctx, "stmt-diag-check-pending", txn,
   207  			sqlbase.InternalExecutorSessionDataOverride{
   208  				User: security.RootUser,
   209  			},
   210  			"SELECT count(1) FROM system.statement_diagnostics_requests "+
   211  				"WHERE completed = false AND statement_fingerprint = $1",
   212  			fprint)
   213  		if err != nil {
   214  			return err
   215  		}
   216  		count := int(*row[0].(*tree.DInt))
   217  		if count != 0 {
   218  			return errors.New("a pending request for the requested fingerprint already exists")
   219  		}
   220  
   221  		row, err = r.ie.QueryRowEx(ctx, "stmt-diag-insert-request", txn,
   222  			sqlbase.InternalExecutorSessionDataOverride{
   223  				User: security.RootUser,
   224  			},
   225  			"INSERT INTO system.statement_diagnostics_requests (statement_fingerprint, requested_at) "+
   226  				"VALUES ($1, $2) RETURNING id",
   227  			fprint, timeutil.Now())
   228  		if err != nil {
   229  			return err
   230  		}
   231  		reqID = requestID(*row[0].(*tree.DInt))
   232  		return nil
   233  	})
   234  	if err != nil {
   235  		return 0, err
   236  	}
   237  
   238  	// Manually insert the request in the (local) registry. This lets this node
   239  	// pick up the request quickly if the right query comes around, without
   240  	// waiting for the poller.
   241  	r.mu.Lock()
   242  	defer r.mu.Unlock()
   243  	r.mu.epoch++
   244  	r.addRequestInternalLocked(ctx, reqID, fprint)
   245  
   246  	// Notify all the other nodes that they have to poll.
   247  	buf := make([]byte, 8)
   248  	binary.LittleEndian.PutUint64(buf, uint64(reqID))
   249  	if err := g.AddInfo(gossip.KeyGossipStatementDiagnosticsRequest, buf, 0 /* ttl */); err != nil {
   250  		log.Warningf(ctx, "error notifying of diagnostics request: %s", err)
   251  	}
   252  
   253  	return reqID, nil
   254  }
   255  
   256  func (r *Registry) removeOngoing(requestID requestID) {
   257  	r.mu.Lock()
   258  	defer r.mu.Unlock()
   259  	// Remove the request from r.mu.ongoing.
   260  	delete(r.mu.ongoing, requestID)
   261  }
   262  
   263  // ShouldCollectDiagnostics checks whether any data should be collected for the
   264  // given query, which is the case if the registry has a request for this
   265  // statement's fingerprint; in this case ShouldCollectDiagnostics will not
   266  // return true again on this note for the same diagnostics request.
   267  //
   268  // If data is to be collected, Finish() must always be called on the returned
   269  // stmtDiagnosticsHelper once the data was collected.
   270  func (r *Registry) ShouldCollectDiagnostics(
   271  	ctx context.Context, ast tree.Statement,
   272  ) (bool, func(ctx context.Context, traceJSON tree.Datum, bundle []byte, collectionErr error)) {
   273  	r.mu.Lock()
   274  	defer r.mu.Unlock()
   275  
   276  	// Return quickly if we have no requests to trace.
   277  	if len(r.mu.requestFingerprints) == 0 {
   278  		return false, nil
   279  	}
   280  
   281  	fingerprint := tree.AsStringWithFlags(ast, tree.FmtHideConstants)
   282  	var reqID requestID
   283  	for id, f := range r.mu.requestFingerprints {
   284  		if f == fingerprint {
   285  			reqID = id
   286  			break
   287  		}
   288  	}
   289  	if reqID == 0 {
   290  		return false, nil
   291  	}
   292  
   293  	// Remove the request.
   294  	delete(r.mu.requestFingerprints, reqID)
   295  	if r.mu.ongoing == nil {
   296  		r.mu.ongoing = make(map[requestID]struct{})
   297  	}
   298  
   299  	r.mu.ongoing[reqID] = struct{}{}
   300  	return true, makeStmtDiagnosticsHelper(r, fingerprint, tree.AsString(ast), reqID).Finish
   301  }
   302  
   303  type stmtDiagnosticsHelper struct {
   304  	r            *Registry
   305  	fingerprint  string
   306  	statementStr string
   307  	requestID    requestID
   308  }
   309  
   310  func makeStmtDiagnosticsHelper(
   311  	r *Registry, fingerprint string, statementStr string, requestID requestID,
   312  ) *stmtDiagnosticsHelper {
   313  	return &stmtDiagnosticsHelper{
   314  		r:            r,
   315  		fingerprint:  fingerprint,
   316  		statementStr: statementStr,
   317  		requestID:    requestID,
   318  	}
   319  }
   320  
   321  // Finish reports the trace and creates the support bundle, and inserts them in
   322  // the system tables.
   323  func (h *stmtDiagnosticsHelper) Finish(
   324  	ctx context.Context, traceJSON tree.Datum, bundle []byte, collectionErr error,
   325  ) {
   326  	defer h.r.removeOngoing(h.requestID)
   327  
   328  	_, err := h.r.insertStatementDiagnostics(
   329  		ctx,
   330  		h.requestID,
   331  		h.fingerprint,
   332  		h.statementStr,
   333  		traceJSON,
   334  		bundle,
   335  		collectionErr,
   336  	)
   337  	if err != nil {
   338  		log.Warningf(ctx, "failed to report statement diagnostics: %s", err)
   339  	}
   340  }
   341  
   342  // InsertStatementDiagnostics inserts a trace into system.statement_diagnostics.
   343  //
   344  // traceJSON is either DNull (when collectionErr should not be nil) or a *DJSON.
   345  func (r *Registry) InsertStatementDiagnostics(
   346  	ctx context.Context, stmtFingerprint string, stmt string, traceJSON tree.Datum, bundle []byte,
   347  ) (int64, error) {
   348  	id, err := r.insertStatementDiagnostics(ctx, 0, /* requestID */
   349  		stmtFingerprint, stmt, traceJSON, bundle, nil /* collectionErr */)
   350  	return int64(id), err
   351  }
   352  
   353  // insertStatementDiagnostics inserts a trace into system.statement_diagnostics.
   354  //
   355  // traceJSON is either DNull (when collectionErr should not be nil) or a *DJSON.
   356  //
   357  // It also marks the request as completed in
   358  // system.statement_diagnostics_requests. If requestID is zero, a new entry is
   359  // inserted.
   360  //
   361  // collectionErr should be any error generated during the serialization of the
   362  // collected trace.
   363  func (r *Registry) insertStatementDiagnostics(
   364  	ctx context.Context,
   365  	requestID requestID,
   366  	stmtFingerprint string,
   367  	stmt string,
   368  	traceJSON tree.Datum,
   369  	bundle []byte,
   370  	collectionErr error,
   371  ) (stmtID, error) {
   372  	var diagID stmtID
   373  	err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   374  		if requestID != 0 {
   375  			row, err := r.ie.QueryRowEx(ctx, "stmt-diag-check-completed", txn,
   376  				sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
   377  				"SELECT count(1) FROM system.statement_diagnostics_requests WHERE id = $1 AND completed = false",
   378  				requestID)
   379  			if err != nil {
   380  				return err
   381  			}
   382  			cnt := int(*row[0].(*tree.DInt))
   383  			if cnt == 0 {
   384  				// Someone else already marked the request as completed. We've traced for nothing.
   385  				// This can only happen once per node, per request since we're going to
   386  				// remove the request from the registry.
   387  				return nil
   388  			}
   389  		}
   390  
   391  		// Generate the values that will be inserted.
   392  		errorVal := tree.DNull
   393  		if collectionErr != nil {
   394  			errorVal = tree.NewDString(collectionErr.Error())
   395  		}
   396  
   397  		bundleChunksVal := tree.DNull
   398  		if len(bundle) != 0 {
   399  			// Insert the bundle into system.statement_bundle_chunks.
   400  			// TODO(radu): split in chunks.
   401  			row, err := r.ie.QueryRowEx(
   402  				ctx, "stmt-bundle-chunks-insert", txn,
   403  				sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
   404  				"INSERT INTO system.statement_bundle_chunks(description, data) VALUES ($1, $2) RETURNING id",
   405  				"statement diagnostics bundle",
   406  				tree.NewDBytes(tree.DBytes(bundle)),
   407  			)
   408  			if err != nil {
   409  				return err
   410  			}
   411  			chunkID := row[0].(*tree.DInt)
   412  
   413  			array := tree.NewDArray(types.Int)
   414  			if err := array.Append(chunkID); err != nil {
   415  				return err
   416  			}
   417  			bundleChunksVal = array
   418  		}
   419  
   420  		collectionTime := timeutil.Now()
   421  
   422  		// Insert the trace into system.statement_diagnostics.
   423  		row, err := r.ie.QueryRowEx(
   424  			ctx, "stmt-diag-insert", txn,
   425  			sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
   426  			"INSERT INTO system.statement_diagnostics "+
   427  				"(statement_fingerprint, statement, collected_at, trace, bundle_chunks, error) "+
   428  				"VALUES ($1, $2, $3, $4, $5, $6) RETURNING id",
   429  			stmtFingerprint, stmt, collectionTime, traceJSON, bundleChunksVal, errorVal,
   430  		)
   431  		if err != nil {
   432  			return err
   433  		}
   434  		diagID = stmtID(*row[0].(*tree.DInt))
   435  
   436  		if requestID != 0 {
   437  			// Mark the request from system.statement_diagnostics_request as completed.
   438  			_, err := r.ie.ExecEx(ctx, "stmt-diag-mark-completed", txn,
   439  				sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
   440  				"UPDATE system.statement_diagnostics_requests "+
   441  					"SET completed = true, statement_diagnostics_id = $1 WHERE id = $2",
   442  				diagID, requestID)
   443  			if err != nil {
   444  				return err
   445  			}
   446  		} else {
   447  			// Insert a completed request into system.statement_diagnostics_request.
   448  			// This is necessary because the UI uses this table to discover completed
   449  			// diagnostics.
   450  			_, err := r.ie.ExecEx(ctx, "stmt-diag-add-completed", txn,
   451  				sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
   452  				"INSERT INTO system.statement_diagnostics_requests"+
   453  					" (completed, statement_fingerprint, statement_diagnostics_id, requested_at)"+
   454  					" VALUES (true, $1, $2, $3)",
   455  				stmtFingerprint, diagID, collectionTime)
   456  			if err != nil {
   457  				return err
   458  			}
   459  		}
   460  		return nil
   461  	})
   462  	if err != nil {
   463  		return 0, err
   464  	}
   465  	return diagID, nil
   466  }
   467  
   468  // pollRequests reads the pending rows from system.statement_diagnostics_requests and
   469  // updates r.mu.requests accordingly.
   470  func (r *Registry) pollRequests(ctx context.Context) error {
   471  	var rows []tree.Datums
   472  	// Loop until we run the query without straddling an epoch increment.
   473  	for {
   474  		r.mu.Lock()
   475  		epoch := r.mu.epoch
   476  		r.mu.Unlock()
   477  
   478  		var err error
   479  		rows, err = r.ie.QueryEx(ctx, "stmt-diag-poll", nil, /* txn */
   480  			sqlbase.InternalExecutorSessionDataOverride{
   481  				User: security.RootUser,
   482  			},
   483  			"SELECT id, statement_fingerprint FROM system.statement_diagnostics_requests "+
   484  				"WHERE completed = false")
   485  		if err != nil {
   486  			return err
   487  		}
   488  
   489  		r.mu.Lock()
   490  		// If the epoch changed it means that a request was added to the registry
   491  		// manually while the query was running. In that case, if we were to process
   492  		// the query results normally, we might remove that manually-added request.
   493  		if r.mu.epoch != epoch {
   494  			r.mu.Unlock()
   495  			continue
   496  		}
   497  		break
   498  	}
   499  	defer r.mu.Unlock()
   500  
   501  	var ids util.FastIntSet
   502  	for _, row := range rows {
   503  		id := requestID(*row[0].(*tree.DInt))
   504  		fprint := string(*row[1].(*tree.DString))
   505  
   506  		ids.Add(int(id))
   507  		r.addRequestInternalLocked(ctx, id, fprint)
   508  	}
   509  
   510  	// Remove all other requests.
   511  	for id := range r.mu.requestFingerprints {
   512  		if !ids.Contains(int(id)) {
   513  			delete(r.mu.requestFingerprints, id)
   514  		}
   515  	}
   516  	return nil
   517  }
   518  
   519  // gossipNotification is called in response to a gossip update informing us that
   520  // we need to poll.
   521  func (r *Registry) gossipNotification(s string, value roachpb.Value) {
   522  	if s != gossip.KeyGossipStatementDiagnosticsRequest {
   523  		// We don't expect any other notifications. Perhaps in a future version we
   524  		// added other keys with the same prefix.
   525  		return
   526  	}
   527  	select {
   528  	case r.gossipUpdateChan <- requestID(binary.LittleEndian.Uint64(value.RawBytes)):
   529  	default:
   530  		// Don't pile up on these requests and don't block gossip.
   531  	}
   532  }