github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/cancel.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"time"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    19  )
    20  
    21  // Motivation:
    22  // Although there are unit tests that test query cancellation, they have been
    23  // insufficient in detecting problems with canceling long-running, multi-node
    24  // DistSQL queries. This is because, unlike local queries which only need to
    25  // cancel the transaction's context, DistSQL queries must cancel flow contexts
    26  // on each node involed in the query. Typical strategies for local execution
    27  // testing involve using a builtin like generate_series to create artificially
    28  // long-running queries, but these approaches don't create multi-node DistSQL
    29  // queries; the only way to do so is by querying a large dataset split across
    30  // multiple nodes. Due to the high cost of loading the pre-requisite data, these
    31  // tests are best suited as nightlies.
    32  //
    33  // Once DistSQL queries provide more testing knobs, these tests can likely be
    34  // replaced with unit tests.
    35  func registerCancel(r *testRegistry) {
    36  	runCancel := func(ctx context.Context, t *test, c *cluster,
    37  		queries []string, warehouses int, useDistsql bool) {
    38  		c.Put(ctx, cockroach, "./cockroach", c.All())
    39  		c.Put(ctx, workload, "./workload", c.All())
    40  		c.Start(ctx, t, c.All())
    41  
    42  		m := newMonitor(ctx, c, c.All())
    43  		m.Go(func(ctx context.Context) error {
    44  			t.Status("importing TPCC fixture")
    45  			c.Run(ctx, c.Node(1), fmt.Sprintf(
    46  				"./workload fixtures load tpcc --warehouses=%d {pgurl:1}", warehouses))
    47  
    48  			conn := c.Conn(ctx, 1)
    49  			defer conn.Close()
    50  
    51  			var queryPrefix string
    52  			if !useDistsql {
    53  				queryPrefix = "SET distsql = off;"
    54  			}
    55  
    56  			t.Status("running queries to cancel")
    57  			for _, q := range queries {
    58  				sem := make(chan struct{}, 1)
    59  				go func(q string) {
    60  					t.l.Printf("executing \"%s\"\n", q)
    61  					sem <- struct{}{}
    62  					_, err := conn.Exec(queryPrefix + q)
    63  					if err == nil {
    64  						close(sem)
    65  						t.Fatal("query completed before it could be canceled")
    66  					} else {
    67  						fmt.Printf("query failed with error: %s\n", err)
    68  					}
    69  					sem <- struct{}{}
    70  				}(q)
    71  
    72  				<-sem
    73  
    74  				// The cancel query races with the execution of the query it's trying to
    75  				// cancel, which may result in attempting to cancel the query before it
    76  				// has started.  To be more confident that the query is executing, wait
    77  				// a bit before attempting to cancel it.
    78  				time.Sleep(100 * time.Millisecond)
    79  
    80  				const cancelQuery = `CANCEL QUERIES
    81  	SELECT query_id FROM [SHOW CLUSTER QUERIES] WHERE query not like '%SHOW CLUSTER QUERIES%'`
    82  				c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "`+cancelQuery+`"`)
    83  				cancelStartTime := timeutil.Now()
    84  
    85  				select {
    86  				case _, ok := <-sem:
    87  					if !ok {
    88  						t.Fatal("query could not be canceled")
    89  					}
    90  					timeToCancel := timeutil.Now().Sub(cancelStartTime)
    91  					fmt.Printf("canceling \"%s\" took %s\n", q, timeToCancel)
    92  
    93  				case <-time.After(5 * time.Second):
    94  					t.Fatal("query took too long to respond to cancellation")
    95  				}
    96  			}
    97  
    98  			return nil
    99  		})
   100  		m.Wait()
   101  	}
   102  
   103  	const warehouses = 10
   104  	const numNodes = 3
   105  	queries := []string{
   106  		`SELECT * FROM tpcc.stock`,
   107  		`SELECT * FROM tpcc.stock WHERE s_quantity > 100`,
   108  		`SELECT s_i_id, sum(s_quantity) FROM tpcc.stock GROUP BY s_i_id`,
   109  		`SELECT * FROM tpcc.stock ORDER BY s_quantity`,
   110  		`SELECT * FROM tpcc.order_line JOIN tpcc.stock ON s_i_id=ol_i_id`,
   111  		`SELECT ol_number, sum(s_quantity) FROM tpcc.stock JOIN tpcc.order_line ON s_i_id=ol_i_id WHERE ol_number > 10 GROUP BY ol_number ORDER BY ol_number`,
   112  	}
   113  
   114  	r.Add(testSpec{
   115  		Name:    fmt.Sprintf("cancel/tpcc/distsql/w=%d,nodes=%d", warehouses, numNodes),
   116  		Owner:   OwnerSQLExec,
   117  		Cluster: makeClusterSpec(numNodes),
   118  		Run: func(ctx context.Context, t *test, c *cluster) {
   119  			runCancel(ctx, t, c, queries, warehouses, true /* useDistsql */)
   120  		},
   121  	})
   122  
   123  	r.Add(testSpec{
   124  		Name:    fmt.Sprintf("cancel/tpcc/local/w=%d,nodes=%d", warehouses, numNodes),
   125  		Owner:   OwnerSQLExec,
   126  		Cluster: makeClusterSpec(numNodes),
   127  		Run: func(ctx context.Context, t *test, c *cluster) {
   128  			runCancel(ctx, t, c, queries, warehouses, false /* useDistsql */)
   129  		},
   130  	})
   131  }