github.com/cloudberrydb/gpbackup@v1.0.3-0.20240118031043-5410fd45eed6/utils/agent_remote.go (about) 1 package utils 2 3 import ( 4 "fmt" 5 "io" 6 path "path/filepath" 7 "strings" 8 "sync" 9 10 "github.com/cloudberrydb/gp-common-go-libs/cluster" 11 "github.com/cloudberrydb/gp-common-go-libs/gplog" 12 "github.com/cloudberrydb/gp-common-go-libs/iohelper" 13 "github.com/cloudberrydb/gp-common-go-libs/operating" 14 "github.com/cloudberrydb/gpbackup/filepath" 15 "github.com/pkg/errors" 16 ) 17 18 var helperMutex sync.Mutex 19 20 /* 21 * Functions to run commands on entire cluster during both backup and restore 22 */ 23 24 /* 25 * The reason that gprestore is in charge of creating the first pipe to ensure 26 * that the first pipe is created before the first COPY FROM is issued. If 27 * gpbackup_helper was in charge of creating the first pipe, there is a 28 * possibility that the COPY FROM commands start before gpbackup_helper is done 29 * starting up and setting up the first pipe. 30 */ 31 func CreateSegmentPipeOnAllHosts(oid string, c *cluster.Cluster, fpInfo filepath.FilePathInfo) { 32 remoteOutput := c.GenerateAndExecuteCommand("Creating segment data pipes", cluster.ON_SEGMENTS, func(contentID int) string { 33 pipeName := fpInfo.GetSegmentPipeFilePath(contentID) 34 pipeName = fmt.Sprintf("%s_%s", pipeName, oid) 35 gplog.Debug("Creating pipe %s", pipeName) 36 return fmt.Sprintf("mkfifo %s", pipeName) 37 }) 38 c.CheckClusterError(remoteOutput, "Unable to create segment data pipes", func(contentID int) string { 39 return "Unable to create segment data pipe" 40 }) 41 } 42 43 func WriteOidListToSegments(oidList []string, c *cluster.Cluster, fpInfo filepath.FilePathInfo, fileSuffix string) { 44 rsync_exists := CommandExists("rsync") 45 if !rsync_exists { 46 gplog.Fatal(errors.New("Failed to find rsync on PATH. Please ensure rsync is installed."), "") 47 } 48 49 localOidFile, err := operating.System.TempFile("", "gpbackup-oids") 50 gplog.FatalOnError(err, "Cannot open temporary file to write oids") 51 defer func() { 52 err = operating.System.Remove(localOidFile.Name()) 53 if err != nil { 54 gplog.Warn("Cannot remove temporary oid file: %s, Err: %s", localOidFile.Name(), err.Error()) 55 } 56 }() 57 58 WriteOidsToFile(localOidFile.Name(), oidList) 59 60 generateScpCmd := func(contentID int) string { 61 sourceFile := localOidFile.Name() 62 hostname := c.GetHostForContent(contentID) 63 dest := fpInfo.GetSegmentHelperFilePath(contentID, fileSuffix) 64 65 return fmt.Sprintf(`rsync -e ssh %s %s:%s`, sourceFile, hostname, dest) 66 } 67 remoteOutput := c.GenerateAndExecuteCommand("rsync oid file to segments", cluster.ON_LOCAL|cluster.ON_SEGMENTS, generateScpCmd) 68 69 errMsg := "Failed to rsync oid file" 70 errFunc := func(contentID int) string { 71 return "Failed to run rsync" 72 } 73 c.CheckClusterError(remoteOutput, errMsg, errFunc, false) 74 } 75 76 func WriteOidsToFile(filename string, oidList []string) { 77 oidFp, err := iohelper.OpenFileForWriting(filename) 78 gplog.FatalOnError(err, filename) 79 defer func() { 80 err = oidFp.Close() 81 gplog.FatalOnError(err, filename) 82 }() 83 84 err = WriteOids(oidFp, oidList) 85 gplog.FatalOnError(err, filename) 86 } 87 88 func WriteOids(writer io.Writer, oidList []string) error { 89 var err error 90 for _, oid := range oidList { 91 _, err = writer.Write([]byte(oid + "\n")) 92 if err != nil { 93 // error logging handled in calling functions 94 return err 95 } 96 } 97 98 return nil 99 } 100 101 func VerifyHelperVersionOnSegments(version string, c *cluster.Cluster) { 102 remoteOutput := c.GenerateAndExecuteCommand("Verifying gpbackup_helper version", cluster.ON_HOSTS, func(contentID int) string { 103 gphome := operating.System.Getenv("GPHOME") 104 return fmt.Sprintf("%s/bin/gpbackup_helper --version", gphome) 105 }) 106 c.CheckClusterError(remoteOutput, "Could not verify gpbackup_helper version", func(contentID int) string { 107 return "Could not verify gpbackup_helper version" 108 }) 109 110 numIncorrect := 0 111 for contentID, cmd := range remoteOutput.Commands { 112 segVersion := strings.TrimSpace(cmd.Stdout) 113 segVersion = strings.Split(segVersion, " ")[2] // Format is "gpbackup_helper version [version string]" 114 if segVersion != version { 115 gplog.Verbose("Version mismatch for gpbackup_helper on segment %d on host %s: Expected version %s, found version %s.", contentID, c.GetHostForContent(contentID), version, segVersion) 116 numIncorrect++ 117 } 118 } 119 if numIncorrect > 0 { 120 cluster.LogFatalClusterError("The version of gpbackup_helper must match the version of gpbackup/gprestore, but found gpbackup_helper binaries with invalid version", cluster.ON_HOSTS, numIncorrect) 121 } 122 } 123 124 func StartGpbackupHelpers(c *cluster.Cluster, fpInfo filepath.FilePathInfo, operation string, pluginConfigFile string, compressStr string, onErrorContinue bool, isFilter bool, wasTerminated *bool, copyQueue int, isSingleDataFile bool, resizeCluster bool, origSize int, destSize int) { 125 // A mutex lock for cleaning up and starting gpbackup helpers prevents a 126 // race condition that causes gpbackup_helpers to be orphaned if 127 // gpbackup_helper cleanup happens before they are started. 128 helperMutex.Lock() 129 if *wasTerminated { 130 helperMutex.Unlock() 131 select {} // Pause forever and wait for cleanup to exit program. 132 } 133 defer helperMutex.Unlock() 134 135 gphomePath := operating.System.Getenv("GPHOME") 136 pluginStr := "" 137 if pluginConfigFile != "" { 138 _, configFilename := path.Split(pluginConfigFile) 139 pluginStr = fmt.Sprintf(" --plugin-config /tmp/%s", configFilename) 140 } 141 onErrorContinueStr := "" 142 if onErrorContinue { 143 onErrorContinueStr = " --on-error-continue" 144 } 145 filterStr := "" 146 if isFilter { 147 filterStr = " --with-filters" 148 } 149 singleDataFileStr := "" 150 if isSingleDataFile { 151 singleDataFileStr = " --single-data-file" 152 } 153 resizeStr := "" 154 if resizeCluster { 155 resizeStr = fmt.Sprintf(" --resize-cluster --orig-seg-count %d --dest-seg-count %d", origSize, destSize) 156 } 157 remoteOutput := c.GenerateAndExecuteCommand("Starting gpbackup_helper agent", cluster.ON_SEGMENTS, func(contentID int) string { 158 tocFile := fpInfo.GetSegmentTOCFilePath(contentID) 159 oidFile := fpInfo.GetSegmentHelperFilePath(contentID, "oid") 160 scriptFile := fpInfo.GetSegmentHelperFilePath(contentID, "script") 161 pipeFile := fpInfo.GetSegmentPipeFilePath(contentID) 162 backupFile := fpInfo.GetTableBackupFilePath(contentID, 0, GetPipeThroughProgram().Extension, true) 163 replicatedOidFile := fpInfo.GetSegmentHelperFilePath(contentID, "replicated_oid") 164 helperCmdStr := fmt.Sprintf(`gpbackup_helper %s --toc-file %s --oid-file %s --pipe-file %s --data-file "%s" --content %d%s%s%s%s%s%s --copy-queue-size %d --replication-file %s`, 165 operation, tocFile, oidFile, pipeFile, backupFile, contentID, pluginStr, compressStr, onErrorContinueStr, filterStr, singleDataFileStr, resizeStr, copyQueue, replicatedOidFile) 166 // we run these commands in sequence to ensure that any failure is critical; the last command ensures the agent process was successfully started 167 return fmt.Sprintf(`cat << HEREDOC > %[1]s && chmod +x %[1]s && ( nohup %[1]s &> /dev/null &) 168 #!/bin/bash 169 source %[2]s/greenplum_path.sh 170 %[2]s/bin/%s 171 172 HEREDOC 173 174 `, scriptFile, gphomePath, helperCmdStr) 175 }) 176 c.CheckClusterError(remoteOutput, "Error starting gpbackup_helper agent", func(contentID int) string { 177 return "Error starting gpbackup_helper agent" 178 }) 179 } 180 181 func CleanUpHelperFilesOnAllHosts(c *cluster.Cluster, fpInfo filepath.FilePathInfo) { 182 remoteOutput := c.GenerateAndExecuteCommand("Removing oid list and helper script files from segment data directories", cluster.ON_SEGMENTS, func(contentID int) string { 183 errorFile := fmt.Sprintf("%s_error", fpInfo.GetSegmentPipeFilePath(contentID)) 184 oidFile := fpInfo.GetSegmentHelperFilePath(contentID, "oid") 185 scriptFile := fpInfo.GetSegmentHelperFilePath(contentID, "script") 186 return fmt.Sprintf("rm -f %s && rm -f %s && rm -f %s", errorFile, oidFile, scriptFile) 187 }) 188 errMsg := fmt.Sprintf("Unable to remove segment helper file(s). See %s for a complete list of segments with errors and remove manually.", 189 gplog.GetLogFilePath()) 190 c.CheckClusterError(remoteOutput, errMsg, func(contentID int) string { 191 errorFile := fmt.Sprintf("%s_error", fpInfo.GetSegmentPipeFilePath(contentID)) 192 return fmt.Sprintf("Unable to remove helper file %s on segment %d on host %s", errorFile, contentID, c.GetHostForContent(contentID)) 193 }, true) 194 } 195 196 func CleanUpSegmentHelperProcesses(c *cluster.Cluster, fpInfo filepath.FilePathInfo, operation string) { 197 helperMutex.Lock() 198 defer helperMutex.Unlock() 199 200 remoteOutput := c.GenerateAndExecuteCommand("Cleaning up segment agent processes", cluster.ON_SEGMENTS, func(contentID int) string { 201 tocFile := fpInfo.GetSegmentTOCFilePath(contentID) 202 procPattern := fmt.Sprintf("gpbackup_helper --%s-agent --toc-file %s", operation, tocFile) 203 /* 204 * We try to avoid erroring out if no gpbackup_helper processes are found, 205 * as it's possible that all gpbackup_helper processes have finished by 206 * the time DoCleanup is called. 207 */ 208 return fmt.Sprintf("PIDS=`ps ux | grep \"%s\" | grep -v grep | awk '{print $2}'`; if [[ ! -z \"$PIDS\" ]]; then kill -USR1 $PIDS; fi", procPattern) 209 }) 210 c.CheckClusterError(remoteOutput, "Unable to clean up agent processes", func(contentID int) string { 211 return "Unable to clean up agent process" 212 }) 213 } 214 215 func CheckAgentErrorsOnSegments(c *cluster.Cluster, fpInfo filepath.FilePathInfo) error { 216 remoteOutput := c.GenerateAndExecuteCommand("Checking whether segment agents had errors", cluster.ON_SEGMENTS, func(contentID int) string { 217 errorFile := fmt.Sprintf("%s_error", fpInfo.GetSegmentPipeFilePath(contentID)) 218 /* 219 * If an error file exists we want to indicate an error, as that means 220 * the agent errored out. If no file exists, the agent was successful. 221 */ 222 return fmt.Sprintf("if [[ -f %s ]]; then echo 'error'; fi; rm -f %s", errorFile, errorFile) 223 }) 224 225 numErrors := 0 226 for contentID, cmd := range remoteOutput.Commands { 227 if strings.TrimSpace(cmd.Stdout) == "error" { 228 gplog.Verbose("Error occurred with helper agent on segment %d on host %s.", contentID, c.GetHostForContent(contentID)) 229 numErrors++ 230 } 231 } 232 if numErrors > 0 { 233 helperLogName := fpInfo.GetHelperLogPath() 234 return errors.Errorf("Encountered errors with %d helper agent(s). See %s for a complete list of segments with errors, and see %s on the corresponding hosts for detailed error messages.", 235 numErrors, gplog.GetLogFilePath(), helperLogName) 236 } 237 return nil 238 } 239 240 func CreateSkipFileOnSegments(oid string, tableName string, c *cluster.Cluster, fpInfo filepath.FilePathInfo) { 241 createSkipFileLogMsg := fmt.Sprintf("Creating skip file on segments for restore entry %s (%s)", oid, tableName) 242 remoteOutput := c.GenerateAndExecuteCommand(createSkipFileLogMsg, cluster.ON_SEGMENTS, func(contentID int) string { 243 return fmt.Sprintf("touch %s_skip_%s", fpInfo.GetSegmentPipeFilePath(contentID), oid) 244 }) 245 c.CheckClusterError(remoteOutput, "Error while creating skip file on segments", func(contentID int) string { 246 return fmt.Sprintf("Could not create skip file %s_skip_%s on segments", fpInfo.GetSegmentPipeFilePath(contentID), oid) 247 }) 248 }