github.com/tuhaihe/gpbackup@v1.0.3/end_to_end/locks_test.go (about) 1 package end_to_end_test 2 3 import ( 4 "fmt" 5 "os/exec" 6 "time" 7 8 "github.com/tuhaihe/gp-common-go-libs/testhelper" 9 "github.com/tuhaihe/gpbackup/backup" 10 "github.com/tuhaihe/gpbackup/testutils" 11 . "github.com/onsi/ginkgo/v2" 12 . "github.com/onsi/gomega" 13 ) 14 15 var _ = Describe("Deadlock handling", func() { 16 BeforeEach(func() { 17 end_to_end_setup() 18 testhelper.AssertQueryRuns(backupConn, "CREATE table bigtable(id int unique); INSERT INTO bigtable SELECT generate_series(1,1000000)") 19 }) 20 AfterEach(func() { 21 end_to_end_teardown() 22 testhelper.AssertQueryRuns(backupConn, "DROP table bigtable") 23 }) 24 It("runs gpbackup with jobs flag and COPY deadlock handling occurs", func() { 25 Skip("Cloudberry skip") 26 if useOldBackupVersion { 27 Skip("This test is not needed for old backup versions") 28 } 29 // Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts 30 // to grab AccessShareLocks before its metadata dump section. 31 backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE") 32 33 // Execute gpbackup with --jobs 10 since there are 10 tables to back up 34 args := []string{ 35 "--dbname", "testdb", 36 "--backup-dir", backupDir, 37 "--jobs", "10", 38 "--verbose"} 39 cmd := exec.Command(gpbackupPath, args...) 40 // Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once 41 // that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its 42 // trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the 43 // beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump. 44 anotherConn := testutils.SetupTestDbConn("testdb") 45 defer anotherConn.Close() 46 go func() { 47 // Query to see if gpbackup's AccessShareLock request on public.foo is blocked 48 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 49 50 // Wait up to 10 seconds for gpbackup to block 51 var gpbackupBlockedLockCount int 52 iterations := 100 53 for iterations > 0 { 54 _ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 55 if gpbackupBlockedLockCount < 1 { 56 time.Sleep(100 * time.Millisecond) 57 iterations-- 58 } else { 59 break 60 } 61 } 62 63 // Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup 64 // during the trigger metadata dump so that the test can queue a bunch of 65 // AccessExclusiveLock requests against the test tables. Afterwards, release the 66 // AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump. 67 anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`) 68 backupConn.MustExec("COMMIT") 69 }() 70 71 // Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we 72 // see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM 73 // FULL) on all the test tables. 74 dataTables := []string{`public."FOObar"`, "public.foo", "public.holds", "public.sales", "public.bigtable", 75 "schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"} 76 for _, dataTable := range dataTables { 77 go func(dataTable string) { 78 accessExclusiveLockConn := testutils.SetupTestDbConn("testdb") 79 defer accessExclusiveLockConn.Close() 80 81 // Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked 82 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 83 84 // Wait up to 10 seconds for gpbackup to block 85 var gpbackupBlockedLockCount int 86 iterations := 100 87 for iterations > 0 { 88 _ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 89 if gpbackupBlockedLockCount < 1 { 90 time.Sleep(100 * time.Millisecond) 91 iterations-- 92 } else { 93 break 94 } 95 } 96 97 // Queue an AccessExclusiveLock request on a test table which will later 98 // result in a detected deadlock during the gpbackup data dump section. 99 accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, dataTable)) 100 }(dataTable) 101 } 102 103 // Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block. 104 // Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock 105 // gpbackup and let gpbackup move forward to the data dump section. 106 var accessExclBlockedLockCount int 107 go func() { 108 // Query to check for ungranted AccessExclusiveLock requests on our test tables 109 checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'` 110 111 // Wait up to 10 seconds 112 iterations := 100 113 for iterations > 0 { 114 _ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery) 115 if accessExclBlockedLockCount < 10 { 116 time.Sleep(100 * time.Millisecond) 117 iterations-- 118 } else { 119 break 120 } 121 } 122 123 // Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger 124 anotherConn.MustExec("COMMIT") 125 }() 126 127 // gpbackup has finished 128 output, _ := cmd.CombinedOutput() 129 stdout := string(output) 130 131 // Check that 10 deadlock traps were placed during the test 132 Expect(accessExclBlockedLockCount).To(Equal(0)) 133 // No non-main worker should have been able to run COPY due to deadlock detection 134 for i := 1; i < 10; i++ { 135 expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i) 136 Expect(stdout).To(ContainSubstring(expectedLockString)) 137 138 expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i) 139 Expect(stdout).To(ContainSubstring(expectedWarnString)) 140 141 unexpectedCopyString := fmt.Sprintf("[DEBUG]:-Worker %d: COPY ", i) 142 Expect(stdout).ToNot(ContainSubstring(unexpectedCopyString)) 143 } 144 145 // Only the main worker thread, worker 0, will run COPY on all the test tables 146 for _, dataTable := range dataTables { 147 expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: COPY %s`, dataTable) 148 Expect(stdout).To(ContainSubstring(expectedString)) 149 } 150 151 Expect(stdout).To(ContainSubstring("Backup completed successfully")) 152 }) 153 It("runs gpbackup with copy-queue-size flag and COPY deadlock handling occurs", func() { 154 Skip("Cloudberry skip") 155 if useOldBackupVersion { 156 Skip("This test is not needed for old backup versions") 157 } 158 // Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts 159 // to grab AccessShareLocks before its metadata dump section. 160 backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE") 161 162 // Execute gpbackup with --copy-queue-size 2 163 args := []string{ 164 "--dbname", "testdb", 165 "--backup-dir", backupDir, 166 "--single-data-file", 167 "--copy-queue-size", "2", 168 "--verbose"} 169 cmd := exec.Command(gpbackupPath, args...) 170 171 // Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once 172 // that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its 173 // trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the 174 // beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump. 175 anotherConn := testutils.SetupTestDbConn("testdb") 176 defer anotherConn.Close() 177 go func() { 178 // Query to see if gpbackup's AccessShareLock request on public.foo is blocked 179 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 180 181 // Wait up to 10 seconds for gpbackup to block 182 var gpbackupBlockedLockCount int 183 iterations := 100 184 for iterations > 0 { 185 _ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 186 if gpbackupBlockedLockCount < 1 { 187 time.Sleep(100 * time.Millisecond) 188 iterations-- 189 } else { 190 break 191 } 192 } 193 194 // Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup 195 // during the trigger metadata dump so that the test can queue a bunch of 196 // AccessExclusiveLock requests against the test tables. Afterwards, release the 197 // AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump. 198 anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`) 199 backupConn.MustExec("COMMIT") 200 }() 201 202 // Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we 203 // see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM 204 // FULL) on all the test tables. 205 dataTables := []string{`public."FOObar"`, "public.foo", "public.holds", "public.sales", "public.bigtable", 206 "schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"} 207 for _, dataTable := range dataTables { 208 go func(dataTable string) { 209 accessExclusiveLockConn := testutils.SetupTestDbConn("testdb") 210 defer accessExclusiveLockConn.Close() 211 212 // Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked 213 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 214 215 // Wait up to 10 seconds for gpbackup to block 216 var gpbackupBlockedLockCount int 217 iterations := 100 218 for iterations > 0 { 219 _ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 220 if gpbackupBlockedLockCount < 1 { 221 time.Sleep(100 * time.Millisecond) 222 iterations-- 223 } else { 224 break 225 } 226 } 227 228 // Queue an AccessExclusiveLock request on a test table which will later 229 // result in a detected deadlock during the gpbackup data dump section. 230 accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, dataTable)) 231 }(dataTable) 232 } 233 234 // Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block. 235 // Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock 236 // gpbackup and let gpbackup move forward to the data dump section. 237 var accessExclBlockedLockCount int 238 go func() { 239 Skip("Cloudberry skip") 240 // Query to check for ungranted AccessExclusiveLock requests on our test tables 241 checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'` 242 243 // Wait up to 10 seconds 244 iterations := 100 245 for iterations > 0 { 246 _ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery) 247 if accessExclBlockedLockCount < 10 { 248 time.Sleep(100 * time.Millisecond) 249 iterations-- 250 } else { 251 break 252 } 253 } 254 255 // Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger 256 anotherConn.MustExec("COMMIT") 257 }() 258 259 // gpbackup has finished 260 output, _ := cmd.CombinedOutput() 261 stdout := string(output) 262 263 // Check that 10 deadlock traps were placed during the test 264 Expect(accessExclBlockedLockCount).To(Equal(10)) 265 // No non-main worker should have been able to run COPY due to deadlock detection 266 for i := 1; i < 2; i++ { 267 expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i) 268 Expect(stdout).To(ContainSubstring(expectedLockString)) 269 270 expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i) 271 Expect(stdout).To(ContainSubstring(expectedWarnString)) 272 273 unexpectedCopyString := fmt.Sprintf("[DEBUG]:-Worker %d: COPY ", i) 274 Expect(stdout).ToNot(ContainSubstring(unexpectedCopyString)) 275 276 expectedLockString = fmt.Sprintf(`Locks held on table %s`, dataTables[i]) 277 Expect(stdout).To(ContainSubstring(expectedLockString)) 278 279 Expect(stdout).To(ContainSubstring(`"Mode":"AccessExclusiveLock"`)) 280 } 281 282 // Only the main worker thread, worker 0, will run COPY on all the test tables 283 for _, dataTable := range dataTables { 284 expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: COPY %s`, dataTable) 285 Expect(stdout).To(ContainSubstring(expectedString)) 286 } 287 288 Expect(stdout).To(ContainSubstring("Backup completed successfully")) 289 }) 290 It("runs gpbackup and defers 2 deadlocked tables to main worker", func() { 291 if true { 292 Skip(fmt.Sprintf("This test is not needed for old backup versions or GPDB versions < %s", backup.SNAPSHOT_GPDB_MIN_VERSION)) 293 } 294 // Acquire AccessExclusiveLock on public.foo to block gpbackup when it attempts 295 // to grab AccessShareLocks before its metadata dump section. 296 backupConn.MustExec("BEGIN; LOCK TABLE public.foo IN ACCESS EXCLUSIVE MODE") 297 298 args := []string{ 299 "--dbname", "testdb", 300 "--backup-dir", backupDir, 301 "--jobs", "2", 302 "--verbose"} 303 cmd := exec.Command(gpbackupPath, args...) 304 // Concurrently wait for gpbackup to block when it requests an AccessShareLock on public.foo. Once 305 // that happens, acquire an AccessExclusiveLock on pg_catalog.pg_trigger to block gpbackup during its 306 // trigger metadata dump. Then release the initial AccessExclusiveLock on public.foo (from the 307 // beginning of the test) to unblock gpbackup and let gpbackup move forward to the trigger metadata dump. 308 anotherConn := testutils.SetupTestDbConn("testdb") 309 defer anotherConn.Close() 310 go func() { 311 // Query to see if gpbackup's AccessShareLock request on public.foo is blocked 312 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'public' AND c.relname = 'foo' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 313 314 // Wait up to 10 seconds for gpbackup to block 315 var gpbackupBlockedLockCount int 316 iterations := 100 317 for iterations > 0 { 318 _ = anotherConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 319 if gpbackupBlockedLockCount < 1 { 320 time.Sleep(100 * time.Millisecond) 321 iterations-- 322 } else { 323 break 324 } 325 } 326 327 // Queue AccessExclusiveLock request on pg_catalog.pg_trigger to block gpbackup 328 // during the trigger metadata dump so that the test can queue a bunch of 329 // AccessExclusiveLock requests against the test tables. Afterwards, release the 330 // AccessExclusiveLock on public.foo to let gpbackup go to the trigger metadata dump. 331 anotherConn.MustExec(`BEGIN; LOCK TABLE pg_catalog.pg_trigger IN ACCESS EXCLUSIVE MODE`) 332 backupConn.MustExec("COMMIT") 333 }() 334 335 // Concurrently wait for gpbackup to block on the trigger metadata dump section. Once we 336 // see gpbackup blocked, request AccessExclusiveLock (to imitate a TRUNCATE or VACUUM 337 // FULL) on two of the test tables. 338 dataTables := []string{"public.holds", "public.sales", "public.bigtable", 339 "schema2.ao1", "schema2.ao2", "schema2.foo2", "schema2.foo3", "schema2.returns"} 340 lockedTables := []string{`public."FOObar"`, "public.foo"} 341 for _, lockedTable := range lockedTables { 342 go func(lockedTable string) { 343 accessExclusiveLockConn := testutils.SetupTestDbConn("testdb") 344 defer accessExclusiveLockConn.Close() 345 346 // Query to see if gpbackup's AccessShareLock request on pg_catalog.pg_trigger is blocked 347 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'pg_catalog' AND c.relname = 'pg_trigger' AND l.granted = 'f' AND l.mode = 'AccessShareLock'` 348 349 // Wait up to 10 seconds for gpbackup to block 350 var gpbackupBlockedLockCount int 351 iterations := 100 352 for iterations > 0 { 353 _ = accessExclusiveLockConn.Get(&gpbackupBlockedLockCount, checkLockQuery) 354 if gpbackupBlockedLockCount < 1 { 355 time.Sleep(100 * time.Millisecond) 356 iterations-- 357 } else { 358 break 359 } 360 } 361 // Queue an AccessExclusiveLock request on a test table which will later 362 // result in a detected deadlock during the gpbackup data dump section. 363 accessExclusiveLockConn.MustExec(fmt.Sprintf(`BEGIN; LOCK TABLE %s IN ACCESS EXCLUSIVE MODE; COMMIT`, lockedTable)) 364 }(lockedTable) 365 } 366 367 // Concurrently wait for all AccessExclusiveLock requests on all 10 test tables to block. 368 // Once that happens, release the AccessExclusiveLock on pg_catalog.pg_trigger to unblock 369 // gpbackup and let gpbackup move forward to the data dump section. 370 var accessExclBlockedLockCount int 371 go func() { 372 // Query to check for ungranted AccessExclusiveLock requests on our test tables 373 checkLockQuery := `SELECT count(*) FROM pg_locks WHERE granted = 'f' AND mode = 'AccessExclusiveLock'` 374 375 // Wait up to 10 seconds 376 iterations := 100 377 for iterations > 0 { 378 _ = backupConn.Get(&accessExclBlockedLockCount, checkLockQuery) 379 if accessExclBlockedLockCount < 9 { 380 time.Sleep(100 * time.Millisecond) 381 iterations-- 382 } else { 383 break 384 } 385 } 386 387 // Unblock gpbackup by releasing AccessExclusiveLock on pg_catalog.pg_trigger 388 anotherConn.MustExec("COMMIT") 389 }() 390 391 // gpbackup has finished 392 output, _ := cmd.CombinedOutput() 393 stdout := string(output) 394 395 // Check that 2 deadlock traps were placed during the test 396 Expect(accessExclBlockedLockCount).To(Equal(2)) 397 // No non-main worker should have been able to run COPY due to deadlock detection 398 for i := 1; i < backupConn.NumConns; i++ { 399 expectedLockString := fmt.Sprintf("[DEBUG]:-Worker %d: LOCK TABLE ", i) 400 Expect(stdout).To(ContainSubstring(expectedLockString)) 401 402 expectedWarnString := fmt.Sprintf("[WARNING]:-Worker %d could not acquire AccessShareLock for table", i) 403 Expect(stdout).To(ContainSubstring(expectedWarnString)) 404 405 unexpectedCopyString := fmt.Sprintf("[DEBUG]:-Worker %d: COPY ", i) 406 Expect(stdout).To(ContainSubstring(unexpectedCopyString)) 407 } 408 409 // Only the main worker thread, worker 0, will run COPY on the 2 locked test tables 410 for _, lockedTable := range lockedTables { 411 expectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: COPY %s`, lockedTable) 412 Expect(stdout).To(ContainSubstring(expectedString)) 413 } 414 for _, dataTable := range dataTables { 415 unexpectedString := fmt.Sprintf(`[DEBUG]:-Worker 0: COPY %s`, dataTable) 416 Expect(stdout).ToNot(ContainSubstring(unexpectedString)) 417 } 418 Expect(stdout).To(ContainSubstring("Backup completed successfully")) 419 }) 420 })