github.com/greenplum-db/gpbackup@v0.0.0-20240517212602-89daab1885b3/end_to_end/signal_handler_test.go (about) 1 package end_to_end_test 2 3 import ( 4 "math/rand" 5 "os/exec" 6 "time" 7 8 "github.com/greenplum-db/gp-common-go-libs/testhelper" 9 . "github.com/onsi/ginkgo/v2" 10 . "github.com/onsi/gomega" 11 "golang.org/x/sys/unix" 12 ) 13 14 var _ = Describe("Signal handler tests", func() { 15 BeforeEach(func() { 16 end_to_end_setup() 17 testhelper.AssertQueryRuns(backupConn, "CREATE table bigtable(id int unique); INSERT INTO bigtable SELECT generate_series(1,10000000)") 18 }) 19 AfterEach(func() { 20 end_to_end_teardown() 21 testhelper.AssertQueryRuns(backupConn, "DROP TABLE bigtable") 22 }) 23 Context("SIGINT", func() { 24 It("runs gpbackup and sends a SIGINT to ensure cleanup functions successfully", func() { 25 if useOldBackupVersion { 26 Skip("This test is not needed for old backup versions") 27 } 28 args := []string{"--dbname", "testdb", 29 "--backup-dir", backupDir, 30 "--single-data-file", 31 "--verbose"} 32 cmd := exec.Command(gpbackupPath, args...) 33 go func() { 34 /* 35 * We use a random delay for the sleep in this test (between 36 * 0.5s and 1.5s) so that gpbackup will be interrupted at a 37 * different point in the backup process every time to help 38 * catch timing issues with the cleanup. 39 */ 40 rng := rand.New(rand.NewSource(time.Now().UnixNano())) 41 time.Sleep(time.Duration(rng.Intn(1000)+500) * time.Millisecond) 42 _ = cmd.Process.Signal(unix.SIGINT) 43 }() 44 output, _ := cmd.CombinedOutput() 45 stdout := string(output) 46 Expect(stdout).To(ContainSubstring("Received an interrupt signal, aborting backup process")) 47 Expect(stdout).To(ContainSubstring("Cleanup complete")) 48 Expect(stdout).To(Not(ContainSubstring("CRITICAL"))) 49 timestamp := getBackupTimestamp(stdout) 50 if timestamp != "" { // empty timestamp means backup was killed before generating timestamp 51 assertArtifactsCleaned(timestamp) 52 } 53 }) 54 It("runs gpbackup with copy-queue-size and sends a SIGINT to ensure cleanup functions successfully", func() { 55 if useOldBackupVersion { 56 Skip("This test is not needed for old backup versions") 57 } 58 args := []string{"--dbname", "testdb", 59 "--backup-dir", backupDir, 60 "--single-data-file", 61 "--copy-queue-size", "4", 62 "--verbose"} 63 cmd := exec.Command(gpbackupPath, args...) 64 go func() { 65 /* 66 * We use a random delay for the sleep in this test (between 67 * 0.5s and 1.5s) so that gpbackup will be interrupted at a 68 * different point in the backup process every time to help 69 * catch timing issues with the cleanup. 70 */ 71 rng := rand.New(rand.NewSource(time.Now().UnixNano())) 72 time.Sleep(time.Duration(rng.Intn(1000)+500) * time.Millisecond) 73 _ = cmd.Process.Signal(unix.SIGINT) 74 }() 75 output, _ := cmd.CombinedOutput() 76 stdout := string(output) 77 Expect(stdout).To(ContainSubstring("Received an interrupt signal, aborting backup process")) 78 Expect(stdout).To(ContainSubstring("Cleanup complete")) 79 Expect(stdout).To(Not(ContainSubstring("CRITICAL"))) 80 timestamp := getBackupTimestamp(stdout) 81 if timestamp != "" { // empty timestamp means backup was killed before generating timestamp 82 assertArtifactsCleaned(timestamp) 83 } 84 }) 85 It("runs gpbackup and sends a SIGINT to ensure blocked LOCK TABLE query is canceled", func() { 86 if useOldBackupVersion { 87 Skip("This test is not needed for old backup versions") 88 } 89 90 // Query to see if gpbackup lock acquire on schema2.foo2 is blocked 91 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'schema2' AND c.relname = 'foo2' AND l.granted = 'f'` 92 93 // Acquire AccessExclusiveLock on schema2.foo2 to prevent gpbackup from acquiring AccessShareLock 94 backupConn.MustExec("BEGIN; LOCK TABLE schema2.foo2 IN ACCESS EXCLUSIVE MODE") 95 args := []string{ 96 "--dbname", "testdb", 97 "--backup-dir", backupDir, 98 "--verbose"} 99 cmd := exec.Command(gpbackupPath, args...) 100 101 // Wait up to 5 seconds for gpbackup to block on acquiring AccessShareLock. 102 // Once blocked, we send a SIGINT to cancel gpbackup. 103 var beforeLockCount int 104 go func() { 105 iterations := 50 106 for iterations > 0 { 107 _ = backupConn.Get(&beforeLockCount, checkLockQuery) 108 if beforeLockCount < 1 { 109 time.Sleep(100 * time.Millisecond) 110 iterations-- 111 } else { 112 break 113 } 114 } 115 _ = cmd.Process.Signal(unix.SIGINT) 116 }() 117 output, _ := cmd.CombinedOutput() 118 Expect(beforeLockCount).To(Equal(1)) 119 120 // After gpbackup has been canceled, we should no longer see a blocked SQL 121 // session trying to acquire AccessShareLock on foo2. 122 var afterLockCount int 123 _ = backupConn.Get(&afterLockCount, checkLockQuery) 124 Expect(afterLockCount).To(Equal(0)) 125 backupConn.MustExec("ROLLBACK") 126 127 stdout := string(output) 128 Expect(stdout).To(ContainSubstring("Received an interrupt signal, aborting backup process")) 129 Expect(stdout).To(ContainSubstring("Interrupt received while acquiring ACCESS SHARE locks on tables")) 130 Expect(stdout).To(ContainSubstring("Cleanup complete")) 131 Expect(stdout).To(Not(ContainSubstring("CRITICAL"))) 132 timestamp := getBackupTimestamp(stdout) 133 assertArtifactsCleaned(timestamp) 134 }) 135 It("runs gpbackup with single-data-file and sends a SIGINT to ensure blocked LOCK TABLE query is canceled", func() { 136 if useOldBackupVersion { 137 Skip("This test is not needed for old backup versions") 138 } 139 140 // Query to see if gpbackup lock acquire on schema2.foo2 is blocked 141 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'schema2' AND c.relname = 'foo2' AND l.granted = 'f'` 142 143 // Acquire AccessExclusiveLock on schema2.foo2 to prevent gpbackup from acquiring AccessShareLock 144 backupConn.MustExec("BEGIN; LOCK TABLE schema2.foo2 IN ACCESS EXCLUSIVE MODE") 145 args := []string{ 146 "--dbname", "testdb", 147 "--backup-dir", backupDir, 148 "--single-data-file", 149 "--verbose"} 150 cmd := exec.Command(gpbackupPath, args...) 151 152 // Wait up to 5 seconds for gpbackup to block on acquiring AccessShareLock. 153 // Once blocked, we send a SIGINT to cancel gpbackup. 154 var beforeLockCount int 155 go func() { 156 iterations := 50 157 for iterations > 0 { 158 _ = backupConn.Get(&beforeLockCount, checkLockQuery) 159 if beforeLockCount < 1 { 160 time.Sleep(100 * time.Millisecond) 161 iterations-- 162 } else { 163 break 164 } 165 } 166 _ = cmd.Process.Signal(unix.SIGINT) 167 }() 168 output, _ := cmd.CombinedOutput() 169 Expect(beforeLockCount).To(Equal(1)) 170 171 // After gpbackup has been canceled, we should no longer see a blocked SQL 172 // session trying to acquire AccessShareLock on foo2. 173 var afterLockCount int 174 _ = backupConn.Get(&afterLockCount, checkLockQuery) 175 Expect(afterLockCount).To(Equal(0)) 176 backupConn.MustExec("ROLLBACK") 177 178 stdout := string(output) 179 Expect(stdout).To(ContainSubstring("Received an interrupt signal, aborting backup process")) 180 Expect(stdout).To(ContainSubstring("Interrupt received while acquiring ACCESS SHARE locks on tables")) 181 Expect(stdout).To(ContainSubstring("Cleanup complete")) 182 Expect(stdout).To(Not(ContainSubstring("CRITICAL"))) 183 timestamp := getBackupTimestamp(stdout) 184 assertArtifactsCleaned(timestamp) 185 }) 186 It("runs gprestore and sends a SIGINT to ensure cleanup functions successfully", func() { 187 if useOldBackupVersion { 188 Skip("This test is not needed for old backup versions") 189 } 190 output := gpbackup(gpbackupPath, backupHelperPath, 191 "--backup-dir", backupDir, 192 "--single-data-file") 193 timestamp := getBackupTimestamp(string(output)) 194 args := []string{ 195 "--timestamp", timestamp, 196 "--redirect-db", "restoredb", 197 "--backup-dir", backupDir, 198 "--verbose"} 199 cmd := exec.Command(gprestorePath, args...) 200 go func() { 201 /* 202 * We use a random delay for the sleep in this test (between 203 * 0.5s and 1.5s) so that gprestore will be interrupted at a 204 * different point in the backup process every time to help 205 * catch timing issues with the cleanup. 206 */ 207 rng := rand.New(rand.NewSource(time.Now().UnixNano())) 208 time.Sleep(time.Duration(rng.Intn(1000)+500) * time.Millisecond) 209 _ = cmd.Process.Signal(unix.SIGINT) 210 }() 211 output, _ = cmd.CombinedOutput() 212 stdout := string(output) 213 Expect(stdout).To(ContainSubstring("Received an interrupt signal, aborting restore process")) 214 Expect(stdout).To(ContainSubstring("Cleanup complete")) 215 Expect(stdout).To(Not(ContainSubstring("CRITICAL"))) 216 assertArtifactsCleaned(timestamp) 217 }) 218 It("runs gprestore with copy-queue-size and sends a SIGINT to ensure cleanup functions successfully", func() { 219 if useOldBackupVersion { 220 Skip("This test is not needed for old backup versions") 221 } 222 outputBkp := gpbackup(gpbackupPath, backupHelperPath, 223 "--backup-dir", backupDir, 224 "--single-data-file") 225 timestampBkp := getBackupTimestamp(string(outputBkp)) 226 args := []string{ 227 "--timestamp", timestampBkp, 228 "--redirect-db", "restoredb", 229 "--backup-dir", backupDir, 230 "--verbose", 231 "--copy-queue-size", "4"} 232 cmd := exec.Command(gprestorePath, args...) 233 go func() { 234 /* 235 * We use a random delay for the sleep in this test (between 236 * 0.5s and 1.5s) so that gprestore will be interrupted at a 237 * different point in the backup process every time to help 238 * catch timing issues with the cleanup. 239 */ 240 rng := rand.New(rand.NewSource(time.Now().UnixNano())) 241 time.Sleep(time.Duration(rng.Intn(1000)+500) * time.Millisecond) 242 _ = cmd.Process.Signal(unix.SIGINT) 243 }() 244 outputRes, _ := cmd.CombinedOutput() 245 stdoutRes := string(outputRes) 246 Expect(stdoutRes).To(ContainSubstring("Received an interrupt signal, aborting restore process")) 247 Expect(stdoutRes).To(ContainSubstring("Cleanup complete")) 248 Expect(stdoutRes).To(Not(ContainSubstring("CRITICAL"))) 249 assertArtifactsCleaned(timestampBkp) 250 }) 251 }) 252 Context("SIGTERM", func() { 253 It("runs gpbackup and sends a SIGTERM to ensure cleanup functions successfully", func() { 254 if useOldBackupVersion { 255 Skip("This test is not needed for old backup versions") 256 } 257 args := []string{"--dbname", "testdb", 258 "--backup-dir", backupDir, 259 "--single-data-file", 260 "--verbose"} 261 cmd := exec.Command(gpbackupPath, args...) 262 go func() { 263 /* 264 * We use a random delay for the sleep in this test (between 265 * 0.5s and 1.5s) so that gpbackup will be interrupted at a 266 * different point in the backup process every time to help 267 * catch timing issues with the cleanup. 268 */ 269 rng := rand.New(rand.NewSource(time.Now().UnixNano())) 270 time.Sleep(time.Duration(rng.Intn(1000)+500) * time.Millisecond) 271 _ = cmd.Process.Signal(unix.SIGTERM) 272 }() 273 output, _ := cmd.CombinedOutput() 274 stdout := string(output) 275 Expect(stdout).To(ContainSubstring("Received a termination signal, aborting backup process")) 276 Expect(stdout).To(ContainSubstring("Cleanup complete")) 277 Expect(stdout).To(Not(ContainSubstring("CRITICAL"))) 278 timestamp := getBackupTimestamp(stdout) 279 if timestamp != "" { // empty timestamp means backup was killed before generating timestamp 280 assertArtifactsCleaned(timestamp) 281 } 282 }) 283 It("runs gpbackup with copy-queue-size and sends a SIGTERM to ensure cleanup functions successfully", func() { 284 if useOldBackupVersion { 285 Skip("This test is not needed for old backup versions") 286 } 287 args := []string{"--dbname", "testdb", 288 "--backup-dir", backupDir, 289 "--single-data-file", 290 "--copy-queue-size", "4", 291 "--verbose"} 292 cmd := exec.Command(gpbackupPath, args...) 293 go func() { 294 /* 295 * We use a random delay for the sleep in this test (between 296 * 0.5s and 1.5s) so that gpbackup will be interrupted at a 297 * different point in the backup process every time to help 298 * catch timing issues with the cleanup. 299 */ 300 rng := rand.New(rand.NewSource(time.Now().UnixNano())) 301 time.Sleep(time.Duration(rng.Intn(1000)+500) * time.Millisecond) 302 _ = cmd.Process.Signal(unix.SIGTERM) 303 }() 304 output, _ := cmd.CombinedOutput() 305 stdout := string(output) 306 Expect(stdout).To(ContainSubstring("Received a termination signal, aborting backup process")) 307 Expect(stdout).To(ContainSubstring("Cleanup complete")) 308 Expect(stdout).To(Not(ContainSubstring("CRITICAL"))) 309 timestamp := getBackupTimestamp(stdout) 310 if timestamp != "" { // empty timestamp means backup was killed before generating timestamp 311 assertArtifactsCleaned(timestamp) 312 } 313 }) 314 It("runs gpbackup and sends a SIGTERM to ensure blocked LOCK TABLE query is canceled", func() { 315 if useOldBackupVersion { 316 Skip("This test is not needed for old backup versions") 317 } 318 319 // Query to see if gpbackup lock acquire on schema2.foo2 is blocked 320 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'schema2' AND c.relname = 'foo2' AND l.granted = 'f'` 321 322 // Acquire AccessExclusiveLock on schema2.foo2 to prevent gpbackup from acquiring AccessShareLock 323 backupConn.MustExec("BEGIN; LOCK TABLE schema2.foo2 IN ACCESS EXCLUSIVE MODE") 324 args := []string{ 325 "--dbname", "testdb", 326 "--backup-dir", backupDir, 327 "--verbose"} 328 cmd := exec.Command(gpbackupPath, args...) 329 330 // Wait up to 5 seconds for gpbackup to block on acquiring AccessShareLock. 331 // Once blocked, we send a SIGTERM to cancel gpbackup. 332 var beforeLockCount int 333 go func() { 334 iterations := 50 335 for iterations > 0 { 336 _ = backupConn.Get(&beforeLockCount, checkLockQuery) 337 if beforeLockCount < 1 { 338 time.Sleep(100 * time.Millisecond) 339 iterations-- 340 } else { 341 break 342 } 343 } 344 _ = cmd.Process.Signal(unix.SIGTERM) 345 }() 346 output, _ := cmd.CombinedOutput() 347 Expect(beforeLockCount).To(Equal(1)) 348 349 // After gpbackup has been canceled, we should no longer see a blocked SQL 350 // session trying to acquire AccessShareLock on foo2. 351 var afterLockCount int 352 _ = backupConn.Get(&afterLockCount, checkLockQuery) 353 Expect(afterLockCount).To(Equal(0)) 354 backupConn.MustExec("ROLLBACK") 355 356 stdout := string(output) 357 Expect(stdout).To(ContainSubstring("Received a termination signal, aborting backup process")) 358 Expect(stdout).To(ContainSubstring("Interrupt received while acquiring ACCESS SHARE locks on tables")) 359 Expect(stdout).To(ContainSubstring("Cleanup complete")) 360 Expect(stdout).To(Not(ContainSubstring("CRITICAL"))) 361 timestamp := getBackupTimestamp(stdout) 362 assertArtifactsCleaned(timestamp) 363 }) 364 It("runs gpbackup with single-data-file and sends a SIGTERM to ensure blocked LOCK TABLE query is canceled", func() { 365 if useOldBackupVersion { 366 Skip("This test is not needed for old backup versions") 367 } 368 369 // Query to see if gpbackup lock acquire on schema2.foo2 is blocked 370 checkLockQuery := `SELECT count(*) FROM pg_locks l, pg_class c, pg_namespace n WHERE l.relation = c.oid AND n.oid = c.relnamespace AND n.nspname = 'schema2' AND c.relname = 'foo2' AND l.granted = 'f'` 371 372 // Acquire AccessExclusiveLock on schema2.foo2 to prevent gpbackup from acquiring AccessShareLock 373 backupConn.MustExec("BEGIN; LOCK TABLE schema2.foo2 IN ACCESS EXCLUSIVE MODE") 374 args := []string{ 375 "--dbname", "testdb", 376 "--backup-dir", backupDir, 377 "--single-data-file", 378 "--verbose"} 379 cmd := exec.Command(gpbackupPath, args...) 380 381 // Wait up to 5 seconds for gpbackup to block on acquiring AccessShareLock. 382 // Once blocked, we send a SIGTERM to cancel gpbackup. 383 var beforeLockCount int 384 go func() { 385 iterations := 50 386 for iterations > 0 { 387 _ = backupConn.Get(&beforeLockCount, checkLockQuery) 388 if beforeLockCount < 1 { 389 time.Sleep(100 * time.Millisecond) 390 iterations-- 391 } else { 392 break 393 } 394 } 395 _ = cmd.Process.Signal(unix.SIGTERM) 396 }() 397 output, _ := cmd.CombinedOutput() 398 Expect(beforeLockCount).To(Equal(1)) 399 400 // After gpbackup has been canceled, we should no longer see a blocked SQL 401 // session trying to acquire AccessShareLock on foo2. 402 var afterLockCount int 403 _ = backupConn.Get(&afterLockCount, checkLockQuery) 404 Expect(afterLockCount).To(Equal(0)) 405 backupConn.MustExec("ROLLBACK") 406 407 stdout := string(output) 408 Expect(stdout).To(ContainSubstring("Received a termination signal, aborting backup process")) 409 Expect(stdout).To(ContainSubstring("Cleanup complete")) 410 Expect(stdout).To(Not(ContainSubstring("CRITICAL"))) 411 timestamp := getBackupTimestamp(stdout) 412 assertArtifactsCleaned(timestamp) 413 }) 414 It("runs gprestore and sends a SIGTERM to ensure cleanup functions successfully", func() { 415 if useOldBackupVersion { 416 Skip("This test is not needed for old backup versions") 417 } 418 outputBkp := gpbackup(gpbackupPath, backupHelperPath, 419 "--backup-dir", backupDir, 420 "--single-data-file") 421 timestampBkp := getBackupTimestamp(string(outputBkp)) 422 args := []string{ 423 "--timestamp", timestampBkp, 424 "--redirect-db", "restoredb", 425 "--backup-dir", backupDir, 426 "--verbose"} 427 cmd := exec.Command(gprestorePath, args...) 428 go func() { 429 /* 430 * We use a random delay for the sleep in this test (between 431 * 0.5s and 1.5s) so that gprestore will be interrupted at a 432 * different point in the backup process every time to help 433 * catch timing issues with the cleanup. 434 */ 435 rng := rand.New(rand.NewSource(time.Now().UnixNano())) 436 time.Sleep(time.Duration(rng.Intn(1000)+500) * time.Millisecond) 437 _ = cmd.Process.Signal(unix.SIGTERM) 438 }() 439 outputRes, _ := cmd.CombinedOutput() 440 stdoutRes := string(outputRes) 441 Expect(stdoutRes).To(ContainSubstring("Received a termination signal, aborting restore process")) 442 Expect(stdoutRes).To(ContainSubstring("Cleanup complete")) 443 Expect(stdoutRes).To(Not(ContainSubstring("CRITICAL"))) 444 assertArtifactsCleaned(timestampBkp) 445 }) 446 It("runs gprestore with copy-queue-size and sends a SIGTERM to ensure cleanup functions successfully", func() { 447 if useOldBackupVersion { 448 Skip("This test is not needed for old backup versions") 449 } 450 outputBkp := gpbackup(gpbackupPath, backupHelperPath, 451 "--backup-dir", backupDir, 452 "--single-data-file") 453 timestampBkp := getBackupTimestamp(string(outputBkp)) 454 args := []string{ 455 "--timestamp", timestampBkp, 456 "--redirect-db", "restoredb", 457 "--backup-dir", backupDir, 458 "--verbose", 459 "--copy-queue-size", "4"} 460 cmd := exec.Command(gprestorePath, args...) 461 go func() { 462 /* 463 * We use a random delay for the sleep in this test (between 464 * 0.5s and 1.5s) so that gprestore will be interrupted at a 465 * different point in the backup process every time to help 466 * catch timing issues with the cleanup. 467 */ 468 rng := rand.New(rand.NewSource(time.Now().UnixNano())) 469 time.Sleep(time.Duration(rng.Intn(1000)+500) * time.Millisecond) 470 _ = cmd.Process.Signal(unix.SIGTERM) 471 }() 472 outputRes, _ := cmd.CombinedOutput() 473 stdoutRes := string(outputRes) 474 Expect(stdoutRes).To(ContainSubstring("Received a termination signal, aborting restore process")) 475 Expect(stdoutRes).To(ContainSubstring("Cleanup complete")) 476 Expect(stdoutRes).To(Not(ContainSubstring("CRITICAL"))) 477 assertArtifactsCleaned(timestampBkp) 478 }) 479 }) 480 })