github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/backup.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"strings"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    20  	"github.com/cockroachdb/cockroach/pkg/util/version"
    21  	"github.com/cockroachdb/errors"
    22  )
    23  
    24  func registerBackup(r *testRegistry) {
    25  	backup2TBSpec := makeClusterSpec(10)
    26  	r.Add(testSpec{
    27  		Name:       fmt.Sprintf("backup2TB/%s", backup2TBSpec),
    28  		Owner:      OwnerBulkIO,
    29  		Cluster:    backup2TBSpec,
    30  		MinVersion: "v2.1.0",
    31  		Run: func(ctx context.Context, t *test, c *cluster) {
    32  			rows := 65104166
    33  			dest := c.name
    34  
    35  			if local {
    36  				rows = 100
    37  				dest += fmt.Sprintf("%d", timeutil.Now().UnixNano())
    38  			}
    39  
    40  			c.Put(ctx, workload, "./workload")
    41  			c.Put(ctx, cockroach, "./cockroach")
    42  
    43  			// NB: starting the cluster creates the logs dir as a side effect,
    44  			// needed below.
    45  			c.Start(ctx, t)
    46  			c.Run(ctx, c.All(), `./workload csv-server --port=8081 &> logs/workload-csv-server.log < /dev/null &`)
    47  			time.Sleep(time.Second) // wait for csv server to open listener
    48  
    49  			c.Run(ctx, c.Node(1), "./workload", "fixtures", "import", "bank",
    50  				"--db=bank", "--payload-bytes=10240", "--ranges=0", "--csv-server", "http://localhost:8081",
    51  				fmt.Sprintf("--rows=%d", rows), "--seed=1", "{pgurl:1}")
    52  
    53  			m := newMonitor(ctx, c)
    54  			m.Go(func(ctx context.Context) error {
    55  				t.Status(`running backup`)
    56  				c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
    57  				BACKUP bank.bank TO 'gs://cockroachdb-backup-testing/`+dest+`'"`)
    58  				return nil
    59  			})
    60  			m.Wait()
    61  		},
    62  	})
    63  
    64  	// backupTPCC continuously runs TPCC, takes a full backup after some time,
    65  	// and incremental after more time. It then restores the two backups and
    66  	// verifies them with a fingerprint.
    67  	r.Add(testSpec{
    68  		Name:    `backupTPCC`,
    69  		Owner:   OwnerBulkIO,
    70  		Cluster: makeClusterSpec(3),
    71  		Timeout: 1 * time.Hour,
    72  		Run: func(ctx context.Context, t *test, c *cluster) {
    73  			c.Put(ctx, cockroach, "./cockroach")
    74  			c.Put(ctx, workload, "./workload")
    75  			c.Start(ctx, t)
    76  			conn := c.Conn(ctx, 1)
    77  
    78  			duration := 5 * time.Minute
    79  			if local {
    80  				duration = 5 * time.Second
    81  			}
    82  			warehouses := 10
    83  
    84  			backupDir := "gs://cockroachdb-backup-testing/" + c.name
    85  			// Use inter-node file sharing on 20.1+.
    86  			if r.buildVersion.AtLeast(version.MustParse(`v20.1.0-0`)) {
    87  				backupDir = "nodelocal://1/" + c.name
    88  			}
    89  			fullDir := backupDir + "/full"
    90  			incDir := backupDir + "/inc"
    91  
    92  			t.Status(`workload initialization`)
    93  			cmd := fmt.Sprintf(
    94  				"./workload init tpcc --warehouses=%d {pgurl:1-%d}",
    95  				warehouses, c.spec.NodeCount,
    96  			)
    97  			c.Run(ctx, c.Node(1), cmd)
    98  
    99  			m := newMonitor(ctx, c)
   100  			m.Go(func(ctx context.Context) error {
   101  				_, err := conn.ExecContext(ctx, `
   102  					CREATE DATABASE restore_full;
   103  					CREATE DATABASE restore_inc;
   104  				`)
   105  				return err
   106  			})
   107  			m.Wait()
   108  
   109  			t.Status(`run tpcc`)
   110  			ctx, cancel := context.WithCancel(ctx)
   111  			defer cancel()
   112  
   113  			cmdDone := make(chan error)
   114  			go func() {
   115  				cmd := fmt.Sprintf(
   116  					"./workload run tpcc --warehouses=%d {pgurl:1-%d}",
   117  					warehouses, c.spec.NodeCount,
   118  				)
   119  
   120  				cmdDone <- c.RunE(ctx, c.Node(1), cmd)
   121  			}()
   122  
   123  			select {
   124  			case <-time.After(duration):
   125  			case <-ctx.Done():
   126  				return
   127  			}
   128  
   129  			t.Status(`full backup`)
   130  			// Use a time slightly in the past to avoid "cannot specify timestamp in the future" errors.
   131  			tFull := fmt.Sprint(timeutil.Now().Add(time.Second * -2).UnixNano())
   132  			m = newMonitor(ctx, c)
   133  			m.Go(func(ctx context.Context) error {
   134  				_, err := conn.ExecContext(ctx,
   135  					`BACKUP tpcc.* TO $1 AS OF SYSTEM TIME `+tFull,
   136  					fullDir,
   137  				)
   138  				return err
   139  			})
   140  			m.Wait()
   141  
   142  			t.Status(`continue tpcc`)
   143  			select {
   144  			case <-time.After(duration):
   145  			case <-ctx.Done():
   146  				return
   147  			}
   148  
   149  			t.Status(`incremental backup`)
   150  			tInc := fmt.Sprint(timeutil.Now().Add(time.Second * -2).UnixNano())
   151  			m = newMonitor(ctx, c)
   152  			m.Go(func(ctx context.Context) error {
   153  				_, err := conn.ExecContext(ctx,
   154  					`BACKUP tpcc.* TO $1 AS OF SYSTEM TIME `+tInc+` INCREMENTAL FROM $2`,
   155  					incDir,
   156  					fullDir,
   157  				)
   158  				if err != nil {
   159  					return err
   160  				}
   161  
   162  				// Backups are done, make sure workload is still running.
   163  				select {
   164  				case err := <-cmdDone:
   165  					// Workload exited before it should have.
   166  					return err
   167  				default:
   168  					return nil
   169  				}
   170  			})
   171  			m.Wait()
   172  
   173  			m = newMonitor(ctx, c)
   174  			m.Go(func(ctx context.Context) error {
   175  				t.Status(`restore full`)
   176  				if _, err := conn.ExecContext(ctx,
   177  					`RESTORE tpcc.* FROM $1 WITH into_db='restore_full'`,
   178  					fullDir,
   179  				); err != nil {
   180  					return err
   181  				}
   182  
   183  				t.Status(`restore incremental`)
   184  				if _, err := conn.ExecContext(ctx,
   185  					`RESTORE tpcc.* FROM $1, $2 WITH into_db='restore_inc'`,
   186  					fullDir,
   187  					incDir,
   188  				); err != nil {
   189  					return err
   190  				}
   191  
   192  				t.Status(`fingerprint`)
   193  				fingerprint := func(db string, asof string) (string, error) {
   194  					var b strings.Builder
   195  
   196  					var tables []string
   197  					rows, err := conn.QueryContext(
   198  						ctx,
   199  						fmt.Sprintf("SELECT table_name FROM [SHOW TABLES FROM %s] ORDER BY table_name", db),
   200  					)
   201  					if err != nil {
   202  						return "", err
   203  					}
   204  					defer rows.Close()
   205  					for rows.Next() {
   206  						var name string
   207  						if err := rows.Scan(&name); err != nil {
   208  							return "", err
   209  						}
   210  						tables = append(tables, name)
   211  					}
   212  
   213  					for _, table := range tables {
   214  						fmt.Fprintf(&b, "table %s\n", table)
   215  						query := fmt.Sprintf("SHOW EXPERIMENTAL_FINGERPRINTS FROM TABLE %s.%s", db, table)
   216  						if asof != "" {
   217  							query = fmt.Sprintf("SELECT * FROM [%s] AS OF SYSTEM TIME %s", query, asof)
   218  						}
   219  						rows, err = conn.QueryContext(ctx, query)
   220  						if err != nil {
   221  							return "", err
   222  						}
   223  						defer rows.Close()
   224  						for rows.Next() {
   225  							var name, fp string
   226  							if err := rows.Scan(&name, &fp); err != nil {
   227  								return "", err
   228  							}
   229  							fmt.Fprintf(&b, "%s: %s\n", name, fp)
   230  						}
   231  					}
   232  
   233  					return b.String(), rows.Err()
   234  				}
   235  
   236  				tpccFull, err := fingerprint("tpcc", tFull)
   237  				if err != nil {
   238  					return err
   239  				}
   240  				tpccInc, err := fingerprint("tpcc", tInc)
   241  				if err != nil {
   242  					return err
   243  				}
   244  				restoreFull, err := fingerprint("restore_full", "")
   245  				if err != nil {
   246  					return err
   247  				}
   248  				restoreInc, err := fingerprint("restore_inc", "")
   249  				if err != nil {
   250  					return err
   251  				}
   252  
   253  				if tpccFull != restoreFull {
   254  					return errors.Errorf("got %s, expected %s", restoreFull, tpccFull)
   255  				}
   256  				if tpccInc != restoreInc {
   257  					return errors.Errorf("got %s, expected %s", restoreInc, tpccInc)
   258  				}
   259  
   260  				return nil
   261  			})
   262  			m.Wait()
   263  		},
   264  	})
   265  }