github.com/greenplum-db/gpbackup@v0.0.0-20240517212602-89daab1885b3/end_to_end/locks_test.go (about) 1 package end_to_end_test 2 3 import ( 4 "fmt" 5 "os/exec" 6 "time" 7 8 "github.com/greenplum-db/gp-common-go-libs/testhelper" 9 "github.com/greenplum-db/gpbackup/backup" 10 "github.com/greenplum-db/gpbackup/testutils" 11 . "github.com/onsi/ginkgo/v2" 12 . "github.com/onsi/gomega" 13 ) 14 15 var _ = Describe("Deadlock handling", func() { 16 BeforeEach(func() { 17 end_to_end_setup() 18 testhelper.AssertQueryRuns(backupConn, "CREATE table bigtable(id int unique); INSERT INTO bigtable SELECT generate_series(1,1000000)") 19 }) 20 AfterEach(func() { 21 end_to_end_teardown() 22 testhelper.AssertQueryRuns(backupConn, "DROP table bigtable") 23 }) 24 It("runs gpbackup with jobs flag and COPY deadlock handling occurs", func() { 25 if useOldBackupVersion { 26 Skip("This test is not needed for old backup versions") 27 } 28 // Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts 29 // to grab AccessShareLocks before its metadata dump section. 30 backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE") 31 32 // Execute gpbackup with --jobs 10 since there are 10 tables to back up 33 args := []string{ 34 "--dbname", "testdb", 35 "--backup-dir", backupDir, 36 "--jobs", "10", 37 "--verbose"} 38 cmd := exec.Command(gpbackupPath, args...) 39 // Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once 40 // that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its 41 // trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the 42 // beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump. 43 anotherConn := testutils.SetupTestDbConn("testdb") 44 defer anotherConn.Close() 45 go func() { 46 // Query to see if gpbackup's AccessShareLock request on public.foo is blocked 47 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 48 49 // Wait up to 10 seconds for gpbackup to block 50 var gpbackupBlockedLockCount int 51 iterations := 100 52 for iterations > 0 { 53 _ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 54 if gpbackupBlockedLockCount < 1 { 55 time.Sleep(100 * time.Millisecond) 56 iterations-- 57 } else { 58 break 59 } 60 } 61 62 // Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup 63 // during the trigger metadata dump so that the test can queue a bunch of 64 // AccessExclusiveLock requests against the test tables. Afterwards, release the 65 // AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump. 66 anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`) 67 backupConn.MustExec("COMMIT") 68 }() 69 70 // Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we 71 // see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM 72 // FULL) on all the test tables. 73 dataTables := []string{`public."FOObar"`, "public.foo", "public.holds", "public.sales", "public.bigtable", 74 "schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"} 75 for _, dataTable := range dataTables { 76 go func(dataTable string) { 77 accessExclusiveLockConn := testutils.SetupTestDbConn("testdb") 78 defer accessExclusiveLockConn.Close() 79 80 // Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked 81 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 82 83 // Wait up to 10 seconds for gpbackup to block 84 var gpbackupBlockedLockCount int 85 iterations := 100 86 for iterations > 0 { 87 _ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 88 if gpbackupBlockedLockCount < 1 { 89 time.Sleep(100 * time.Millisecond) 90 iterations-- 91 } else { 92 break 93 } 94 } 95 96 // Queue an AccessExclusiveLock request on a test table which will later 97 // result in a detected deadlock during the gpbackup data dump section. 98 accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, dataTable)) 99 }(dataTable) 100 } 101 102 // Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block. 103 // Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock 104 // gpbackup and let gpbackup move forward to the data dump section. 105 var accessExclBlockedLockCount int 106 go func() { 107 // Query to check for ungranted AccessExclusiveLock requests on our test tables 108 checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'` 109 110 // Wait up to 10 seconds 111 iterations := 100 112 for iterations > 0 { 113 _ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery) 114 if accessExclBlockedLockCount < 10 { 115 time.Sleep(100 * time.Millisecond) 116 iterations-- 117 } else { 118 break 119 } 120 } 121 122 // Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger 123 anotherConn.MustExec("COMMIT") 124 }() 125 126 // gpbackup has finished 127 output, _ := cmd.CombinedOutput() 128 stdout := string(output) 129 130 // Check that 10 deadlock traps were placed during the test 131 Expect(accessExclBlockedLockCount).To(Equal(10)) 132 // No non-main worker should have been able to run COPY due to deadlock detection 133 for i := 1; i < 10; i++ { 134 expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i) 135 Expect(stdout).To(ContainSubstring(expectedLockString)) 136 137 expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i) 138 Expect(stdout).To(ContainSubstring(expectedWarnString)) 139 140 unexpectedCopyString := fmt.Sprintf(`[DEBUG]:-Worker %d: Executing "COPY `, i) 141 Expect(stdout).ToNot(ContainSubstring(unexpectedCopyString)) 142 } 143 144 // Only the main worker thread, worker 0, will run COPY on all the test tables 145 for _, dataTable := range dataTables { 146 expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: Executing "COPY %s`, dataTable) 147 Expect(stdout).To(ContainSubstring(expectedString)) 148 } 149 150 Expect(stdout).To(ContainSubstring("Backup completed successfully")) 151 }) 152 It("runs gpbackup with copy-queue-size flag and COPY deadlock handling occurs", func() { 153 if useOldBackupVersion { 154 Skip("This test is not needed for old backup versions") 155 } 156 // Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts 157 // to grab AccessShareLocks before its metadata dump section. 158 backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE") 159 160 // Execute gpbackup with --copy-queue-size 2 161 args := []string{ 162 "--dbname", "testdb", 163 "--backup-dir", backupDir, 164 "--single-data-file", 165 "--copy-queue-size", "2", 166 "--verbose"} 167 cmd := exec.Command(gpbackupPath, args...) 168 169 // Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once 170 // that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its 171 // trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the 172 // beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump. 173 anotherConn := testutils.SetupTestDbConn("testdb") 174 defer anotherConn.Close() 175 go func() { 176 // Query to see if gpbackup's AccessShareLock request on public.foo is blocked 177 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 178 179 // Wait up to 10 seconds for gpbackup to block 180 var gpbackupBlockedLockCount int 181 iterations := 100 182 for iterations > 0 { 183 _ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 184 if gpbackupBlockedLockCount < 1 { 185 time.Sleep(100 * time.Millisecond) 186 iterations-- 187 } else { 188 break 189 } 190 } 191 192 // Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup 193 // during the trigger metadata dump so that the test can queue a bunch of 194 // AccessExclusiveLock requests against the test tables. Afterwards, release the 195 // AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump. 196 anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`) 197 backupConn.MustExec("COMMIT") 198 }() 199 200 // Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we 201 // see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM 202 // FULL) on all the test tables. 203 dataTables := []string{`public."FOObar"`, "public.foo", "public.holds", "public.sales", "public.bigtable", 204 "schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"} 205 for _, dataTable := range dataTables { 206 go func(dataTable string) { 207 accessExclusiveLockConn := testutils.SetupTestDbConn("testdb") 208 defer accessExclusiveLockConn.Close() 209 210 // Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked 211 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 212 213 // Wait up to 10 seconds for gpbackup to block 214 var gpbackupBlockedLockCount int 215 iterations := 100 216 for iterations > 0 { 217 _ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 218 if gpbackupBlockedLockCount < 1 { 219 time.Sleep(100 * time.Millisecond) 220 iterations-- 221 } else { 222 break 223 } 224 } 225 226 // Queue an AccessExclusiveLock request on a test table which will later 227 // result in a detected deadlock during the gpbackup data dump section. 228 accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, dataTable)) 229 }(dataTable) 230 } 231 232 // Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block. 233 // Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock 234 // gpbackup and let gpbackup move forward to the data dump section. 235 var accessExclBlockedLockCount int 236 go func() { 237 // Query to check for ungranted AccessExclusiveLock requests on our test tables 238 checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'` 239 240 // Wait up to 10 seconds 241 iterations := 100 242 for iterations > 0 { 243 _ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery) 244 if accessExclBlockedLockCount < 10 { 245 time.Sleep(100 * time.Millisecond) 246 iterations-- 247 } else { 248 break 249 } 250 } 251 252 // Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger 253 anotherConn.MustExec("COMMIT") 254 }() 255 256 // gpbackup has finished 257 output, _ := cmd.CombinedOutput() 258 stdout := string(output) 259 260 // Check that 10 deadlock traps were placed during the test 261 Expect(accessExclBlockedLockCount).To(Equal(10)) 262 // No non-main worker should have been able to run COPY due to deadlock detection 263 for i := 1; i < 2; i++ { 264 expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i) 265 Expect(stdout).To(ContainSubstring(expectedLockString)) 266 267 expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i) 268 Expect(stdout).To(ContainSubstring(expectedWarnString)) 269 270 unexpectedCopyString := fmt.Sprintf(`[DEBUG]:-Worker %d: Executing "COPY `, i) 271 Expect(stdout).ToNot(ContainSubstring(unexpectedCopyString)) 272 273 expectedLockString = fmt.Sprintf(`Locks held on table %s`, dataTables[i]) 274 Expect(stdout).To(ContainSubstring(expectedLockString)) 275 276 Expect(stdout).To(ContainSubstring(`"Mode":"AccessExclusiveLock"`)) 277 } 278 279 // Only the main worker thread, worker 0, will run COPY on all the test tables 280 for _, dataTable := range dataTables { 281 expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: Executing "COPY %s`, dataTable) 282 Expect(stdout).To(ContainSubstring(expectedString)) 283 } 284 285 Expect(stdout).To(ContainSubstring("Backup completed successfully")) 286 }) 287 It("runs gpbackup and defers 2 deadlocked tables to main worker", func() { 288 if useOldBackupVersion || backupConn.Version.Before(backup.SNAPSHOT_GPDB_MIN_VERSION) { 289 Skip(fmt.Sprintf("This test is not needed for old backup versions or GPDB versions < %s", backup.SNAPSHOT_GPDB_MIN_VERSION)) 290 } 291 // Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts 292 // to grab AccessShareLocks before its metadata dump section. 293 backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE") 294 295 args := []string{ 296 "--dbname", "testdb", 297 "--backup-dir", backupDir, 298 "--jobs", "2", 299 "--verbose"} 300 cmd := exec.Command(gpbackupPath, args...) 301 // Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once 302 // that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its 303 // trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the 304 // beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump. 305 anotherConn := testutils.SetupTestDbConn("testdb") 306 defer anotherConn.Close() 307 go func() { 308 // Query to see if gpbackup's AccessShareLock request on public.foo is blocked 309 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 310 311 // Wait up to 10 seconds for gpbackup to block 312 var gpbackupBlockedLockCount int 313 iterations := 100 314 for iterations > 0 { 315 _ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 316 if gpbackupBlockedLockCount < 1 { 317 time.Sleep(100 * time.Millisecond) 318 iterations-- 319 } else { 320 break 321 } 322 } 323 324 // Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup 325 // during the trigger metadata dump so that the test can queue a bunch of 326 // AccessExclusiveLock requests against the test tables. Afterwards, release the 327 // AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump. 328 anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`) 329 backupConn.MustExec("COMMIT") 330 }() 331 332 // Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we 333 // see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM 334 // FULL) on two of the test tables. 335 dataTables := []string{"public.holds", "public.sales", "public.bigtable", 336 "schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"} 337 lockedTables := []string{`public."FOObar"`, "public.foo"} 338 for _, lockedTable := range lockedTables { 339 go func(lockedTable string) { 340 accessExclusiveLockConn := testutils.SetupTestDbConn("testdb") 341 defer accessExclusiveLockConn.Close() 342 343 // Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked 344 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 345 346 // Wait up to 10 seconds for gpbackup to block 347 var gpbackupBlockedLockCount int 348 iterations := 100 349 for iterations > 0 { 350 _ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 351 if gpbackupBlockedLockCount < 1 { 352 time.Sleep(100 * time.Millisecond) 353 iterations-- 354 } else { 355 break 356 } 357 } 358 // Queue an AccessExclusiveLock request on a test table which will later 359 // result in a detected deadlock during the gpbackup data dump section. 360 accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, lockedTable)) 361 }(lockedTable) 362 } 363 364 // Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block. 365 // Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock 366 // gpbackup and let gpbackup move forward to the data dump section. 367 var accessExclBlockedLockCount int 368 go func() { 369 // Query to check for ungranted AccessExclusiveLock requests on our test tables 370 checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'` 371 372 // Wait up to 10 seconds 373 iterations := 100 374 for iterations > 0 { 375 _ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery) 376 if accessExclBlockedLockCount < 9 { 377 time.Sleep(100 * time.Millisecond) 378 iterations-- 379 } else { 380 break 381 } 382 } 383 384 // Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger 385 anotherConn.MustExec("COMMIT") 386 }() 387 388 // gpbackup has finished 389 output, _ := cmd.CombinedOutput() 390 stdout := string(output) 391 392 // Check that 2 deadlock traps were placed during the test 393 Expect(accessExclBlockedLockCount).To(Equal(2)) 394 // No non-main worker should have been able to run COPY due to deadlock detection 395 for i := 1; i < backupConn.NumConns; i++ { 396 expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i) 397 Expect(stdout).To(ContainSubstring(expectedLockString)) 398 399 expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i) 400 Expect(stdout).To(ContainSubstring(expectedWarnString)) 401 402 unexpectedCopyString := fmt.Sprintf(`[DEBUG]:-Worker %d: Executing "COPY `, i) 403 Expect(stdout).To(ContainSubstring(unexpectedCopyString)) 404 } 405 406 // Only the main worker thread, worker 0, will run COPY on the 2 locked test tables 407 for _, lockedTable := range lockedTables { 408 expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: Executing "COPY %s`, lockedTable) 409 Expect(stdout).To(ContainSubstring(expectedString)) 410 } 411 for _, dataTable := range dataTables { 412 unexpectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: Executing "COPY %s`, dataTable) 413 Expect(stdout).ToNot(ContainSubstring(unexpectedString)) 414 } 415 Expect(stdout).To(ContainSubstring("Backup completed successfully")) 416 }) 417 })