github.com/greenplum-db/gpbackup@v0.0.0-20240517212602-89daab1885b3/end_to_end/locks_test.go

package end_to_end_test

import (
	"fmt"
	"os/exec"
	"time"

	"github.com/greenplum-db/gp-common-go-libs/testhelper"
	"github.com/greenplum-db/gpbackup/backup"
	"github.com/greenplum-db/gpbackup/testutils"
	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

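// These tests exercise gpbackup's COPY deadlock handling: while gpbackup is blocked
// during its metadata dump, the tests queue AccessExclusiveLock requests (imitating
// TRUNCATE or VACUUM FULL) against the test tables so that, once the data dump begins,
// the non-main workers cannot acquire their AccessShareLocks and the affected tables
// are deferred to worker 0.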
var _ = Describe("Deadlock handling", func() {
	BeforeEach(func() {
		end_to_end_setup()
		testhelper.AssertQueryRuns(backupConn, "CREATE table bigtable(id int unique); INSERT INTO bigtable SELECT generate_series(1,1000000)")
	})
	AfterEach(func() {
		end_to_end_teardown()
		testhelper.AssertQueryRuns(backupConn, "DROP table bigtable")
	})
	It("runs gpbackup with jobs flag and COPY deadlock handling occurs", func() {
		if useOldBackupVersion {
			Skip("This test is not needed for old backup versions")
		}
		// Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts
		// to grab AccessShareLocks before its metadata dump section.
		backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE")

		// Execute gpbackup with --jobs 10 since there are 10 tables to back up
		args := []string{
			"--dbname", "testdb",
			"--backup-dir", backupDir,
			"--jobs", "10",
			"--verbose"}
		cmd := exec.Command(gpbackupPath, args...)
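		// Note: exec.Command only builds the command; gpbackup is not started until
		// cmd.CombinedOutput() is called at the end of this test, after the lock
		// choreography below has been set up.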
		// Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once
		// that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its
		// trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the
		// beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump.
		anotherConn := testutils.SetupTestDbConn("testdb")
		defer anotherConn.Close()
		go func() {
			// Query to see if gpbackup's AccessShareLock request on public.foo is blocked
			checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`

			// Wait up to 10 seconds for gpbackup to block
			var gpbackupBlockedLockCount int
			iterations := 100
			for iterations > 0 {
				_ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
				if gpbackupBlockedLockCount < 1 {
					time.Sleep(100 * time.Millisecond)
					iterations--
				} else {
					break
				}
			}

			// Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup
			// during the trigger metadata dump so that the test can queue a bunch of
			// AccessExclusiveLock requests against the test tables. Afterwards, release the
			// AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump.
			anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`)
			backupConn.MustExec("COMMIT")
		}()

		// Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we
		// see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM
		// FULL) on all the test tables.
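		// (bigtable is created in the BeforeEach above; the other nine tables already exist
		// in the shared testdb used by the end-to-end suite.)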
		dataTables := []string{`public."FOObar"`, "public.foo", "public.holds", "public.sales", "public.bigtable",
			"schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"}
		for _, dataTable := range dataTables {
			go func(dataTable string) {
				accessExclusiveLockConn := testutils.SetupTestDbConn("testdb")
				defer accessExclusiveLockConn.Close()

				// Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked
				checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`

				// Wait up to 10 seconds for gpbackup to block
				var gpbackupBlockedLockCount int
				iterations := 100
				for iterations > 0 {
					_ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
					if gpbackupBlockedLockCount < 1 {
						time.Sleep(100 * time.Millisecond)
						iterations--
					} else {
						break
					}
				}

				// Queue an AccessExclusiveLock request on a test table which will later
				// result in a detected deadlock during the gpbackup data dump section.
				accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, dataTable))
			}(dataTable)
		}

		// Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block.
		// Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock
		// gpbackup and let gpbackup move forward to the data dump section.
		var accessExclBlockedLockCount int
		go func() {
			// Query to check for ungranted AccessExclusiveLock requests on our test tables
			checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'`

			// Wait up to 10 seconds
			iterations := 100
			for iterations > 0 {
				_ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery)
				if accessExclBlockedLockCount < 10 {
					time.Sleep(100 * time.Millisecond)
					iterations--
				} else {
					break
				}
			}

			// Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger
			anotherConn.MustExec("COMMIT")
		}()

		// Run gpbackup and block until it finishes
		output, _ := cmd.CombinedOutput()
		stdout := string(output)

		// Check that 10 deadlock traps were placed during the test
		Expect(accessExclBlockedLockCount).To(Equal(10))
		// No non-main worker should have been able to run COPY due to deadlock detection
		for i := 1; i < 10; i++ {
			expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i)
			Expect(stdout).To(ContainSubstring(expectedLockString))

			expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i)
			Expect(stdout).To(ContainSubstring(expectedWarnString))

			unexpectedCopyString := fmt.Sprintf(`[DEBUG]:-Worker %d: Executing "COPY `, i)
			Expect(stdout).ToNot(ContainSubstring(unexpectedCopyString))
		}

		// Only the main worker thread, worker 0, will run COPY on all the test tables
		for _, dataTable := range dataTables {
			expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: Executing "COPY %s`, dataTable)
			Expect(stdout).To(ContainSubstring(expectedString))
		}

		Expect(stdout).To(ContainSubstring("Backup completed successfully"))
	})
	It("runs gpbackup with copy-queue-size flag and COPY deadlock handling occurs", func() {
		if useOldBackupVersion {
			Skip("This test is not needed for old backup versions")
		}
		// Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts
		// to grab AccessShareLocks before its metadata dump section.
		backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE")

		// Execute gpbackup with --copy-queue-size 2
		args := []string{
			"--dbname", "testdb",
			"--backup-dir", backupDir,
			"--single-data-file",
			"--copy-queue-size", "2",
			"--verbose"}
		cmd := exec.Command(gpbackupPath, args...)

		// Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once
		// that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its
		// trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the
		// beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump.
		anotherConn := testutils.SetupTestDbConn("testdb")
		defer anotherConn.Close()
		go func() {
			// Query to see if gpbackup's AccessShareLock request on public.foo is blocked
			checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`

			// Wait up to 10 seconds for gpbackup to block
			var gpbackupBlockedLockCount int
			iterations := 100
			for iterations > 0 {
				_ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
				if gpbackupBlockedLockCount < 1 {
					time.Sleep(100 * time.Millisecond)
					iterations--
				} else {
					break
				}
			}

			// Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup
			// during the trigger metadata dump so that the test can queue a bunch of
			// AccessExclusiveLock requests against the test tables. Afterwards, release the
			// AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump.
			anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`)
			backupConn.MustExec("COMMIT")
		}()

		// Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we
		// see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM
		// FULL) on all the test tables.
		dataTables := []string{`public."FOObar"`, "public.foo", "public.holds", "public.sales", "public.bigtable",
			"schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"}
		for _, dataTable := range dataTables {
			go func(dataTable string) {
				accessExclusiveLockConn := testutils.SetupTestDbConn("testdb")
				defer accessExclusiveLockConn.Close()

				// Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked
				checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`

				// Wait up to 10 seconds for gpbackup to block
				var gpbackupBlockedLockCount int
				iterations := 100
				for iterations > 0 {
					_ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
					if gpbackupBlockedLockCount < 1 {
						time.Sleep(100 * time.Millisecond)
						iterations--
					} else {
						break
					}
				}

				// Queue an AccessExclusiveLock request on a test table which will later
				// result in a detected deadlock during the gpbackup data dump section.
				accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, dataTable))
			}(dataTable)
		}

		// Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block.
		// Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock
		// gpbackup and let gpbackup move forward to the data dump section.
		var accessExclBlockedLockCount int
		go func() {
			// Query to check for ungranted AccessExclusiveLock requests on our test tables
			checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'`

			// Wait up to 10 seconds
			iterations := 100
			for iterations > 0 {
				_ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery)
				if accessExclBlockedLockCount < 10 {
					time.Sleep(100 * time.Millisecond)
					iterations--
				} else {
					break
				}
			}

			// Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger
			anotherConn.MustExec("COMMIT")
		}()
		// Run gpbackup and block until it finishes
		output, _ := cmd.CombinedOutput()
		stdout := string(output)

		// Check that 10 deadlock traps were placed during the test
		Expect(accessExclBlockedLockCount).To(Equal(10))
		// No non-main worker should have been able to run COPY due to deadlock detection
		for i := 1; i < 2; i++ {
			expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i)
			Expect(stdout).To(ContainSubstring(expectedLockString))

			expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i)
			Expect(stdout).To(ContainSubstring(expectedWarnString))

			unexpectedCopyString := fmt.Sprintf(`[DEBUG]:-Worker %d: Executing "COPY `, i)
			Expect(stdout).ToNot(ContainSubstring(unexpectedCopyString))

			expectedLockString = fmt.Sprintf(`Locks held on table %s`, dataTables[i])
			Expect(stdout).To(ContainSubstring(expectedLockString))

			Expect(stdout).To(ContainSubstring(`"Mode":"AccessExclusiveLock"`))
		}

		// Only the main worker thread, worker 0, will run COPY on all the test tables
		for _, dataTable := range dataTables {
			expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: Executing "COPY %s`, dataTable)
			Expect(stdout).To(ContainSubstring(expectedString))
		}

		Expect(stdout).To(ContainSubstring("Backup completed successfully"))
	})
	It("runs gpbackup and defers 2 deadlocked tables to main worker", func() {
		if useOldBackupVersion || backupConn.Version.Before(backup.SNAPSHOT_GPDB_MIN_VERSION) {
			Skip(fmt.Sprintf("This test is not needed for old backup versions or GPDB versions < %s", backup.SNAPSHOT_GPDB_MIN_VERSION))
		}
		// Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts
		// to grab AccessShareLocks before its metadata dump section.
		backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE")

		args := []string{
			"--dbname", "testdb",
			"--backup-dir", backupDir,
			"--jobs", "2",
			"--verbose"}
		cmd := exec.Command(gpbackupPath, args...)
		// Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once
		// that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its
		// trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the
		// beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump.
		anotherConn := testutils.SetupTestDbConn("testdb")
		defer anotherConn.Close()
		go func() {
			// Query to see if gpbackup's AccessShareLock request on public.foo is blocked
			checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`

			// Wait up to 10 seconds for gpbackup to block
			var gpbackupBlockedLockCount int
			iterations := 100
			for iterations > 0 {
				_ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
				if gpbackupBlockedLockCount < 1 {
					time.Sleep(100 * time.Millisecond)
					iterations--
				} else {
					break
				}
			}

			// Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup
			// during the trigger metadata dump so that the test can queue a bunch of
			// AccessExclusiveLock requests against the test tables. Afterwards, release the
			// AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump.
			anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`)
			backupConn.MustExec("COMMIT")
		}()

		// Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we
		// see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM
		// FULL) on two of the test tables.
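		// Unlike the previous two tests, only two tables are locked here, so the other
		// workers can still COPY the remaining tables while the two deadlocked tables
		// are deferred to worker 0.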
		dataTables := []string{"public.holds", "public.sales", "public.bigtable",
			"schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"}
		lockedTables := []string{`public."FOObar"`, "public.foo"}
		for _, lockedTable := range lockedTables {
			go func(lockedTable string) {
				accessExclusiveLockConn := testutils.SetupTestDbConn("testdb")
				defer accessExclusiveLockConn.Close()

				// Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked
				checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`

				// Wait up to 10 seconds for gpbackup to block
				var gpbackupBlockedLockCount int
				iterations := 100
				for iterations > 0 {
					_ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
					if gpbackupBlockedLockCount < 1 {
						time.Sleep(100 * time.Millisecond)
						iterations--
					} else {
						break
					}
				}
				// Queue an AccessExclusiveLock request on a test table which will later
				// result in a detected deadlock during the gpbackup data dump section.
				accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, lockedTable))
			}(lockedTable)
		}

		// Concurrently wait for the AccessExclusiveLock requests on both locked test tables to
		// block. Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to
		// unblock gpbackup and let gpbackup move forward to the data dump section.
		var accessExclBlockedLockCount int
		go func() {
			// Query to check for ungranted AccessExclusiveLock requests on our test tables
			checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'`

			// Wait up to 10 seconds
			iterations := 100
			for iterations > 0 {
				_ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery)
				if accessExclBlockedLockCount < 2 {
					time.Sleep(100 * time.Millisecond)
					iterations--
				} else {
					break
				}
			}

			// Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger
			anotherConn.MustExec("COMMIT")
		}()

		// Run gpbackup and block until it finishes
		output, _ := cmd.CombinedOutput()
		stdout := string(output)

		// Check that 2 deadlock traps were placed during the test
		Expect(accessExclBlockedLockCount).To(Equal(2))
		// Each non-main worker should have hit the deadlock trap on a locked table, logged a
		// warning, and then continued to COPY the remaining non-locked tables.
		for i := 1; i < backupConn.NumConns; i++ {
			expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i)
			Expect(stdout).To(ContainSubstring(expectedLockString))

			expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i)
			Expect(stdout).To(ContainSubstring(expectedWarnString))

			expectedCopyString := fmt.Sprintf(`[DEBUG]:-Worker %d: Executing "COPY `, i)
			Expect(stdout).To(ContainSubstring(expectedCopyString))
		}

		// Only the main worker thread, worker 0, will run COPY on the 2 locked test tables
		for _, lockedTable := range lockedTables {
			expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: Executing "COPY %s`, lockedTable)
			Expect(stdout).To(ContainSubstring(expectedString))
		}
		for _, dataTable := range dataTables {
			unexpectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: Executing "COPY %s`, dataTable)
			Expect(stdout).ToNot(ContainSubstring(unexpectedString))
		}
		Expect(stdout).To(ContainSubstring("Backup completed successfully"))
	})
})