github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/disk_stall.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math/rand"
    17  	"strings"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    21  )
    22  
    23  func registerDiskStalledDetection(r *testRegistry) {
    24  	for _, affectsLogDir := range []bool{false, true} {
    25  		for _, affectsDataDir := range []bool{false, true} {
    26  			// Grab copies of the args because we'll pass them into a closure.
    27  			// Everyone's favorite bug to write in Go.
    28  			affectsLogDir := affectsLogDir
    29  			affectsDataDir := affectsDataDir
    30  			r.Add(testSpec{
    31  				Name: fmt.Sprintf(
    32  					"disk-stalled/log=%t,data=%t",
    33  					affectsLogDir, affectsDataDir,
    34  				),
    35  				Owner:      OwnerKV,
    36  				MinVersion: "v19.1.0",
    37  				Cluster:    makeClusterSpec(1),
    38  				Run: func(ctx context.Context, t *test, c *cluster) {
    39  					runDiskStalledDetection(ctx, t, c, affectsLogDir, affectsDataDir)
    40  				},
    41  			})
    42  		}
    43  	}
    44  }
    45  
    46  func runDiskStalledDetection(
    47  	ctx context.Context, t *test, c *cluster, affectsLogDir bool, affectsDataDir bool,
    48  ) {
    49  	n := c.Node(1)
    50  
    51  	c.Put(ctx, cockroach, "./cockroach")
    52  	c.Run(ctx, n, "sudo umount -f {store-dir}/faulty || true")
    53  	c.Run(ctx, n, "mkdir -p {store-dir}/{real,faulty} || true")
    54  	// Make sure the actual logs are downloaded as artifacts.
    55  	c.Run(ctx, n, "rm -f logs && ln -s {store-dir}/real/logs logs || true")
    56  
    57  	t.Status("setting up charybdefs")
    58  
    59  	if err := execCmd(ctx, t.l, roachprod, "install", c.makeNodes(n), "charybdefs"); err != nil {
    60  		t.Fatal(err)
    61  	}
    62  	c.Run(ctx, n, "sudo charybdefs {store-dir}/faulty -oallow_other,modules=subdir,subdir={store-dir}/real")
    63  	c.Run(ctx, n, "sudo mkdir -p {store-dir}/real/logs")
    64  	c.Run(ctx, n, "sudo chmod -R 777 {store-dir}/{real,faulty}")
    65  	l, err := t.l.ChildLogger("cockroach")
    66  	if err != nil {
    67  		t.Fatal(err)
    68  	}
    69  	type result struct {
    70  		err error
    71  		out string
    72  	}
    73  	errCh := make(chan result)
    74  
    75  	// NB: charybdefs' delay nemesis introduces 50ms per syscall. It would
    76  	// be nicer to introduce a longer delay, but this works.
    77  	tooShortSync := 40 * time.Millisecond
    78  
    79  	maxLogSync := time.Hour
    80  	logDir := "real/logs"
    81  	if affectsLogDir {
    82  		logDir = "faulty/logs"
    83  		maxLogSync = tooShortSync
    84  	}
    85  	maxDataSync := time.Hour
    86  	dataDir := "real"
    87  	if affectsDataDir {
    88  		maxDataSync = tooShortSync
    89  		dataDir = "faulty"
    90  	}
    91  
    92  	tStarted := timeutil.Now()
    93  	dur := 10 * time.Minute
    94  	if !affectsDataDir && !affectsLogDir {
    95  		dur = 30 * time.Second
    96  	}
    97  
    98  	go func() {
    99  		t.WorkerStatus("running server")
   100  		out, err := c.RunWithBuffer(ctx, l, n,
   101  			fmt.Sprintf("timeout --signal 9 %ds env COCKROACH_ENGINE_MAX_SYNC_DURATION_FATAL=true "+
   102  				"COCKROACH_ENGINE_MAX_SYNC_DURATION=%s COCKROACH_LOG_MAX_SYNC_DURATION=%s "+
   103  				"./cockroach start --insecure --logtostderr=INFO --store {store-dir}/%s --log-dir {store-dir}/%s",
   104  				int(dur.Seconds()), maxDataSync, maxLogSync, dataDir, logDir,
   105  			),
   106  		)
   107  		errCh <- result{err, string(out)}
   108  	}()
   109  
   110  	time.Sleep(time.Duration(rand.Intn(5)) * time.Second)
   111  
   112  	t.Status("blocking storage")
   113  	c.Run(ctx, n, "charybdefs-nemesis --delay")
   114  
   115  	res := <-errCh
   116  	if res.err == nil {
   117  		t.Fatalf("expected an error: %s", res.out)
   118  	}
   119  
   120  	// This test can also run in sanity check mode to make sure it doesn't fail
   121  	// due to the aggressive env vars above.
   122  	expectMsg := affectsDataDir || affectsLogDir
   123  
   124  	if expectMsg != strings.Contains(res.out, "disk stall detected") {
   125  		t.Fatalf("unexpected output: %v %s", res.err, res.out)
   126  	} else if elapsed := timeutil.Since(tStarted); !expectMsg && elapsed < dur {
   127  		t.Fatalf("no disk stall injected, but process terminated too early after %s (expected >= %s)", elapsed, dur)
   128  	}
   129  
   130  	c.Run(ctx, n, "charybdefs-nemesis --clear")
   131  	c.Run(ctx, n, "sudo umount {store-dir}/faulty")
   132  }