// Source: github.com/tuhaihe/gpbackup@v1.0.3/end_to_end/locks_test.go

     1  package end_to_end_test
     2  
     3  import (
     4  	"fmt"
     5  	"os/exec"
     6  	"time"
     7  
     8  	"github.com/tuhaihe/gp-common-go-libs/testhelper"
     9  	"github.com/tuhaihe/gpbackup/backup"
    10  	"github.com/tuhaihe/gpbackup/testutils"
    11  	. "github.com/onsi/ginkgo/v2"
    12  	. "github.com/onsi/gomega"
    13  )
    14  
    15  var _ = Describe("Deadlock handling", func() {
	// Per-spec setup: run the shared end-to-end setup, then create a large
	// table (1M rows) so the data-dump phase lasts long enough for the
	// concurrent lockers below to engage before COPY completes.
	BeforeEach(func() {
		end_to_end_setup()
		testhelper.AssertQueryRuns(backupConn, "CREATE table bigtable(id int unique); INSERT INTO bigtable SELECT generate_series(1,1000000)")
	})
    20  	AfterEach(func() {
    21  		end_to_end_teardown()
    22  		testhelper.AssertQueryRuns(backupConn, "DROP table bigtable")
    23  	})
    24  	It("runs gpbackup with jobs flag and COPY deadlock handling occurs", func() {
    25  		Skip("Cloudberry skip")
    26  		if useOldBackupVersion {
    27  			Skip("This test is not needed for old backup versions")
    28  		}
    29  		// Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts
    30  		// to grab AccessShareLocks before its metadata dump section.
    31  		backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE")
    32  
    33  		// Execute gpbackup with --jobs 10 since there are 10 tables to back up
    34  		args := []string{
    35  			"--dbname", "testdb",
    36  			"--backup-dir", backupDir,
    37  			"--jobs", "10",
    38  			"--verbose"}
    39  		cmd := exec.Command(gpbackupPath, args...)
    40  		// Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once
    41  		// that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its
    42  		// trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the
    43  		// beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump.
    44  		anotherConn := testutils.SetupTestDbConn("testdb")
    45  		defer anotherConn.Close()
    46  		go func() {
    47  			// Query to see if gpbackup's AccessShareLock request on public.foo is blocked
    48  			checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`
    49  
    50  			// Wait up to 10 seconds for gpbackup to block
    51  			var gpbackupBlockedLockCount int
    52  			iterations := 100
    53  			for iterations > 0 {
    54  				_ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
    55  				if gpbackupBlockedLockCount < 1 {
    56  					time.Sleep(100 * time.Millisecond)
    57  					iterations--
    58  				} else {
    59  					break
    60  				}
    61  			}
    62  
    63  			// Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup
    64  			// during the trigger metadata dump so that the test can queue a bunch of
    65  			// AccessExclusiveLock requests against the test tables. Afterwards, release the
    66  			// AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump.
    67  			anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`)
    68  			backupConn.MustExec("COMMIT")
    69  		}()
    70  
    71  		// Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we
    72  		// see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM
    73  		// FULL) on all the test tables.
    74  		dataTables := []string{`public."FOObar"`, "public.foo", "public.holds", "public.sales", "public.bigtable",
    75  			"schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"}
    76  		for _, dataTable := range dataTables {
    77  			go func(dataTable string) {
    78  				accessExclusiveLockConn := testutils.SetupTestDbConn("testdb")
    79  				defer accessExclusiveLockConn.Close()
    80  
    81  				// Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked
    82  				checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`
    83  
    84  				// Wait up to 10 seconds for gpbackup to block
    85  				var gpbackupBlockedLockCount int
    86  				iterations := 100
    87  				for iterations > 0 {
    88  					_ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
    89  					if gpbackupBlockedLockCount < 1 {
    90  						time.Sleep(100 * time.Millisecond)
    91  						iterations--
    92  					} else {
    93  						break
    94  					}
    95  				}
    96  
    97  				// Queue an AccessExclusiveLock request on a test table which will later
    98  				// result in a detected deadlock during the gpbackup data dump section.
    99  				accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, dataTable))
   100  			}(dataTable)
   101  		}
   102  
   103  		// Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block.
   104  		// Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock
   105  		// gpbackup and let gpbackup move forward to the data dump section.
   106  		var accessExclBlockedLockCount int
   107  		go func() {
   108  			// Query to check for ungranted AccessExclusiveLock requests on our test tables
   109  			checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'`
   110  
   111  			// Wait up to 10 seconds
   112  			iterations := 100
   113  			for iterations > 0 {
   114  				_ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery)
   115  				if accessExclBlockedLockCount < 10 {
   116  					time.Sleep(100 * time.Millisecond)
   117  					iterations--
   118  				} else {
   119  					break
   120  				}
   121  			}
   122  
   123  			// Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger
   124  			anotherConn.MustExec("COMMIT")
   125  		}()
   126  
   127  		// gpbackup has finished
   128  		output, _ := cmd.CombinedOutput()
   129  		stdout := string(output)
   130  
   131  		// Check that 10 deadlock traps were placed during the test
   132  		Expect(accessExclBlockedLockCount).To(Equal(0))
   133  		// No non-main worker should have been able to run COPY due to deadlock detection
   134  		for i := 1; i < 10; i++ {
   135  			expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i)
   136  			Expect(stdout).To(ContainSubstring(expectedLockString))
   137  
   138  			expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i)
   139  			Expect(stdout).To(ContainSubstring(expectedWarnString))
   140  
   141  			unexpectedCopyString := fmt.Sprintf("[DEBUG]:-Worker %d: COPY ", i)
   142  			Expect(stdout).ToNot(ContainSubstring(unexpectedCopyString))
   143  		}
   144  
   145  		// Only the main worker thread, worker 0, will run COPY on all the test tables
   146  		for _, dataTable := range dataTables {
   147  			expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: COPY %s`, dataTable)
   148  			Expect(stdout).To(ContainSubstring(expectedString))
   149  		}
   150  
   151  		Expect(stdout).To(ContainSubstring("Backup completed successfully"))
   152  	})
   153  	It("runs gpbackup with copy-queue-size flag and COPY deadlock handling occurs", func() {
   154  		Skip("Cloudberry skip")
   155  		if useOldBackupVersion {
   156  			Skip("This test is not needed for old backup versions")
   157  		}
   158  		// Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts
   159  		// to grab AccessShareLocks before its metadata dump section.
   160  		backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE")
   161  
   162  		// Execute gpbackup with --copy-queue-size 2
   163  		args := []string{
   164  			"--dbname", "testdb",
   165  			"--backup-dir", backupDir,
   166  			"--single-data-file",
   167  			"--copy-queue-size", "2",
   168  			"--verbose"}
   169  		cmd := exec.Command(gpbackupPath, args...)
   170  
   171  		// Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once
   172  		// that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its
   173  		// trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the
   174  		// beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump.
   175  		anotherConn := testutils.SetupTestDbConn("testdb")
   176  		defer anotherConn.Close()
   177  		go func() {
   178  			// Query to see if gpbackup's AccessShareLock request on public.foo is blocked
   179  			checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`
   180  
   181  			// Wait up to 10 seconds for gpbackup to block
   182  			var gpbackupBlockedLockCount int
   183  			iterations := 100
   184  			for iterations > 0 {
   185  				_ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
   186  				if gpbackupBlockedLockCount < 1 {
   187  					time.Sleep(100 * time.Millisecond)
   188  					iterations--
   189  				} else {
   190  					break
   191  				}
   192  			}
   193  
   194  			// Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup
   195  			// during the trigger metadata dump so that the test can queue a bunch of
   196  			// AccessExclusiveLock requests against the test tables. Afterwards, release the
   197  			// AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump.
   198  			anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`)
   199  			backupConn.MustExec("COMMIT")
   200  		}()
   201  
   202  		// Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we
   203  		// see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM
   204  		// FULL) on all the test tables.
   205  		dataTables := []string{`public."FOObar"`, "public.foo", "public.holds", "public.sales", "public.bigtable",
   206  			"schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"}
   207  		for _, dataTable := range dataTables {
   208  			go func(dataTable string) {
   209  				accessExclusiveLockConn := testutils.SetupTestDbConn("testdb")
   210  				defer accessExclusiveLockConn.Close()
   211  
   212  				// Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked
   213  				checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`
   214  
   215  				// Wait up to 10 seconds for gpbackup to block
   216  				var gpbackupBlockedLockCount int
   217  				iterations := 100
   218  				for iterations > 0 {
   219  					_ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
   220  					if gpbackupBlockedLockCount < 1 {
   221  						time.Sleep(100 * time.Millisecond)
   222  						iterations--
   223  					} else {
   224  						break
   225  					}
   226  				}
   227  
   228  				// Queue an AccessExclusiveLock request on a test table which will later
   229  				// result in a detected deadlock during the gpbackup data dump section.
   230  				accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, dataTable))
   231  			}(dataTable)
   232  		}
   233  
   234  		// Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block.
   235  		// Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock
   236  		// gpbackup and let gpbackup move forward to the data dump section.
   237  		var accessExclBlockedLockCount int
   238  		go func() {
   239  			Skip("Cloudberry skip")
   240  			// Query to check for ungranted AccessExclusiveLock requests on our test tables
   241  			checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'`
   242  
   243  			// Wait up to 10 seconds
   244  			iterations := 100
   245  			for iterations > 0 {
   246  				_ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery)
   247  				if accessExclBlockedLockCount < 10 {
   248  					time.Sleep(100 * time.Millisecond)
   249  					iterations--
   250  				} else {
   251  					break
   252  				}
   253  			}
   254  
   255  			// Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger
   256  			anotherConn.MustExec("COMMIT")
   257  		}()
   258  
   259  		// gpbackup has finished
   260  		output, _ := cmd.CombinedOutput()
   261  		stdout := string(output)
   262  
   263  		// Check that 10 deadlock traps were placed during the test
   264  		Expect(accessExclBlockedLockCount).To(Equal(10))
   265  		// No non-main worker should have been able to run COPY due to deadlock detection
   266  		for i := 1; i < 2; i++ {
   267  			expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i)
   268  			Expect(stdout).To(ContainSubstring(expectedLockString))
   269  
   270  			expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i)
   271  			Expect(stdout).To(ContainSubstring(expectedWarnString))
   272  
   273  			unexpectedCopyString := fmt.Sprintf("[DEBUG]:-Worker %d: COPY ", i)
   274  			Expect(stdout).ToNot(ContainSubstring(unexpectedCopyString))
   275  
   276  			expectedLockString = fmt.Sprintf(`Locks held on table %s`, dataTables[i])
   277  			Expect(stdout).To(ContainSubstring(expectedLockString))
   278  
   279  			Expect(stdout).To(ContainSubstring(`"Mode":"AccessExclusiveLock"`))
   280  		}
   281  
   282  		// Only the main worker thread, worker 0, will run COPY on all the test tables
   283  		for _, dataTable := range dataTables {
   284  			expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: COPY %s`, dataTable)
   285  			Expect(stdout).To(ContainSubstring(expectedString))
   286  		}
   287  
   288  		Expect(stdout).To(ContainSubstring("Backup completed successfully"))
   289  	})
   290  	It("runs gpbackup and defers 2 deadlocked tables to main worker", func() {
   291  		if true {
   292  			Skip(fmt.Sprintf("This test is not needed for old backup versions or GPDB versions < %s", backup.SNAPSHOT_GPDB_MIN_VERSION))
   293  		}
   294  		// Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts
   295  		// to grab AccessShareLocks before its metadata dump section.
   296  		backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE")
   297  
   298  		args := []string{
   299  			"--dbname", "testdb",
   300  			"--backup-dir", backupDir,
   301  			"--jobs", "2",
   302  			"--verbose"}
   303  		cmd := exec.Command(gpbackupPath, args...)
   304  		// Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once
   305  		// that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its
   306  		// trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the
   307  		// beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump.
   308  		anotherConn := testutils.SetupTestDbConn("testdb")
   309  		defer anotherConn.Close()
   310  		go func() {
   311  			// Query to see if gpbackup's AccessShareLock request on public.foo is blocked
   312  			checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`
   313  
   314  			// Wait up to 10 seconds for gpbackup to block
   315  			var gpbackupBlockedLockCount int
   316  			iterations := 100
   317  			for iterations > 0 {
   318  				_ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
   319  				if gpbackupBlockedLockCount < 1 {
   320  					time.Sleep(100 * time.Millisecond)
   321  					iterations--
   322  				} else {
   323  					break
   324  				}
   325  			}
   326  
   327  			// Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup
   328  			// during the trigger metadata dump so that the test can queue a bunch of
   329  			// AccessExclusiveLock requests against the test tables. Afterwards, release the
   330  			// AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump.
   331  			anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`)
   332  			backupConn.MustExec("COMMIT")
   333  		}()
   334  
   335  		// Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we
   336  		// see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM
   337  		// FULL) on two of the test tables.
   338  		dataTables := []string{"public.holds", "public.sales", "public.bigtable",
   339  			"schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"}
   340  		lockedTables := []string{`public."FOObar"`, "public.foo"}
   341  		for _, lockedTable := range lockedTables {
   342  			go func(lockedTable string) {
   343  				accessExclusiveLockConn := testutils.SetupTestDbConn("testdb")
   344  				defer accessExclusiveLockConn.Close()
   345  
   346  				// Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked
   347  				checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'`
   348  
   349  				// Wait up to 10 seconds for gpbackup to block
   350  				var gpbackupBlockedLockCount int
   351  				iterations := 100
   352  				for iterations > 0 {
   353  					_ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery)
   354  					if gpbackupBlockedLockCount < 1 {
   355  						time.Sleep(100 * time.Millisecond)
   356  						iterations--
   357  					} else {
   358  						break
   359  					}
   360  				}
   361  				// Queue an AccessExclusiveLock request on a test table which will later
   362  				// result in a detected deadlock during the gpbackup data dump section.
   363  				accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, lockedTable))
   364  			}(lockedTable)
   365  		}
   366  
   367  		// Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block.
   368  		// Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock
   369  		// gpbackup and let gpbackup move forward to the data dump section.
   370  		var accessExclBlockedLockCount int
   371  		go func() {
   372  			// Query to check for ungranted AccessExclusiveLock requests on our test tables
   373  			checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'`
   374  
   375  			// Wait up to 10 seconds
   376  			iterations := 100
   377  			for iterations > 0 {
   378  				_ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery)
   379  				if accessExclBlockedLockCount < 9 {
   380  					time.Sleep(100 * time.Millisecond)
   381  					iterations--
   382  				} else {
   383  					break
   384  				}
   385  			}
   386  
   387  			// Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger
   388  			anotherConn.MustExec("COMMIT")
   389  		}()
   390  
   391  		// gpbackup has finished
   392  		output, _ := cmd.CombinedOutput()
   393  		stdout := string(output)
   394  
   395  		// Check that 2 deadlock traps were placed during the test
   396  		Expect(accessExclBlockedLockCount).To(Equal(2))
   397  		// No non-main worker should have been able to run COPY due to deadlock detection
   398  		for i := 1; i < backupConn.NumConns; i++ {
   399  			expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i)
   400  			Expect(stdout).To(ContainSubstring(expectedLockString))
   401  
   402  			expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i)
   403  			Expect(stdout).To(ContainSubstring(expectedWarnString))
   404  
   405  			unexpectedCopyString := fmt.Sprintf("[DEBUG]:-Worker %d: COPY ", i)
   406  			Expect(stdout).To(ContainSubstring(unexpectedCopyString))
   407  		}
   408  
   409  		// Only the main worker thread, worker 0, will run COPY on the 2 locked test tables
   410  		for _, lockedTable := range lockedTables {
   411  			expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: COPY %s`, lockedTable)
   412  			Expect(stdout).To(ContainSubstring(expectedString))
   413  		}
   414  		for _, dataTable := range dataTables {
   415  			unexpectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: COPY %s`, dataTable)
   416  			Expect(stdout).ToNot(ContainSubstring(unexpectedString))
   417  		}
   418  		Expect(stdout).To(ContainSubstring("Backup completed successfully"))
   419  	})
   420  })