github.com/cloudberrydb/gpbackup@v1.0.3-0.20240118031043-5410fd45eed6/utils/agent_remote.go

package utils

import (
	"fmt"
	"io"
	path "path/filepath"
	"strings"
	"sync"

	"github.com/cloudberrydb/gp-common-go-libs/cluster"
	"github.com/cloudberrydb/gp-common-go-libs/gplog"
	"github.com/cloudberrydb/gp-common-go-libs/iohelper"
	"github.com/cloudberrydb/gp-common-go-libs/operating"
	"github.com/cloudberrydb/gpbackup/filepath"
	"github.com/pkg/errors"
)

var helperMutex sync.Mutex

/*
 * Functions to run commands on the entire cluster during both backup and restore
 */

/*
 * The reason that gprestore is in charge of creating the first pipe is to
 * ensure that the first pipe exists before the first COPY FROM is issued.  If
 * gpbackup_helper were in charge of creating the first pipe, the COPY FROM
 * commands could start before gpbackup_helper has finished starting up and
 * setting up the first pipe.
 */
func CreateSegmentPipeOnAllHosts(oid string, c *cluster.Cluster, fpInfo filepath.FilePathInfo) {
	remoteOutput := c.GenerateAndExecuteCommand("Creating segment data pipes", cluster.ON_SEGMENTS, func(contentID int) string {
		pipeName := fpInfo.GetSegmentPipeFilePath(contentID)
		pipeName = fmt.Sprintf("%s_%s", pipeName, oid)
		gplog.Debug("Creating pipe %s", pipeName)
		return fmt.Sprintf("mkfifo %s", pipeName)
	})
	c.CheckClusterError(remoteOutput, "Unable to create segment data pipes", func(contentID int) string {
		return "Unable to create segment data pipe"
	})
}

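// WriteOidListToSegments writes the given oid list to a local temporary file
// and rsyncs that file to the helper file location on each segment.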
func WriteOidListToSegments(oidList []string, c *cluster.Cluster, fpInfo filepath.FilePathInfo, fileSuffix string) {
	rsyncExists := CommandExists("rsync")
	if !rsyncExists {
		gplog.Fatal(errors.New("Failed to find rsync on PATH. Please ensure rsync is installed."), "")
	}

	localOidFile, err := operating.System.TempFile("", "gpbackup-oids")
	gplog.FatalOnError(err, "Cannot open temporary file to write oids")
	defer func() {
		err = operating.System.Remove(localOidFile.Name())
		if err != nil {
			gplog.Warn("Cannot remove temporary oid file: %s, Err: %s", localOidFile.Name(), err.Error())
		}
	}()

	WriteOidsToFile(localOidFile.Name(), oidList)

	generateRsyncCmd := func(contentID int) string {
		sourceFile := localOidFile.Name()
		hostname := c.GetHostForContent(contentID)
		dest := fpInfo.GetSegmentHelperFilePath(contentID, fileSuffix)

		return fmt.Sprintf(`rsync -e ssh %s %s:%s`, sourceFile, hostname, dest)
	}
	remoteOutput := c.GenerateAndExecuteCommand("rsync oid file to segments", cluster.ON_LOCAL|cluster.ON_SEGMENTS, generateRsyncCmd)

	errMsg := "Failed to rsync oid file"
	errFunc := func(contentID int) string {
		return "Failed to run rsync"
	}
	c.CheckClusterError(remoteOutput, errMsg, errFunc, false)
}

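// WriteOidsToFile writes the oid list to the given file, one oid per line.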
func WriteOidsToFile(filename string, oidList []string) {
	oidFp, err := iohelper.OpenFileForWriting(filename)
	gplog.FatalOnError(err, filename)
	defer func() {
		err = oidFp.Close()
		gplog.FatalOnError(err, filename)
	}()

	err = WriteOids(oidFp, oidList)
	gplog.FatalOnError(err, filename)
}

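// WriteOids writes each oid in the list to the writer, newline-terminated.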
func WriteOids(writer io.Writer, oidList []string) error {
	var err error
	for _, oid := range oidList {
		_, err = writer.Write([]byte(oid + "\n"))
		if err != nil {
			// error logging handled in calling functions
			return err
		}
	}

	return nil
}

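// VerifyHelperVersionOnSegments checks that the gpbackup_helper binary on each
// host reports the same version as the calling gpbackup/gprestore binary.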
func VerifyHelperVersionOnSegments(version string, c *cluster.Cluster) {
	remoteOutput := c.GenerateAndExecuteCommand("Verifying gpbackup_helper version", cluster.ON_HOSTS, func(contentID int) string {
		gphome := operating.System.Getenv("GPHOME")
		return fmt.Sprintf("%s/bin/gpbackup_helper --version", gphome)
	})
	c.CheckClusterError(remoteOutput, "Could not verify gpbackup_helper version", func(contentID int) string {
		return "Could not verify gpbackup_helper version"
	})

	numIncorrect := 0
	for contentID, cmd := range remoteOutput.Commands {
		segVersion := strings.TrimSpace(cmd.Stdout)
		segVersion = strings.Split(segVersion, " ")[2] // Format is "gpbackup_helper version [version string]"
		if segVersion != version {
			gplog.Verbose("Version mismatch for gpbackup_helper on segment %d on host %s: Expected version %s, found version %s.", contentID, c.GetHostForContent(contentID), version, segVersion)
			numIncorrect++
		}
	}
	if numIncorrect > 0 {
		cluster.LogFatalClusterError("The version of gpbackup_helper must match the version of gpbackup/gprestore, but found gpbackup_helper binaries with invalid version", cluster.ON_HOSTS, numIncorrect)
	}
}

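// StartGpbackupHelpers writes a small launcher script to each segment's data
// directory and uses it to start a gpbackup_helper agent in the background
// with the appropriate backup or restore flags.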
func StartGpbackupHelpers(c *cluster.Cluster, fpInfo filepath.FilePathInfo, operation string, pluginConfigFile string, compressStr string, onErrorContinue bool, isFilter bool, wasTerminated *bool, copyQueue int, isSingleDataFile bool, resizeCluster bool, origSize int, destSize int) {
	// A mutex lock around cleaning up and starting gpbackup_helper agents
	// prevents a race condition in which helpers are orphaned because cleanup
	// runs before they have started.
	helperMutex.Lock()
	if *wasTerminated {
		helperMutex.Unlock()
		select {} // Pause forever and wait for cleanup to exit the program.
	}
	defer helperMutex.Unlock()

	gphomePath := operating.System.Getenv("GPHOME")
	pluginStr := ""
	if pluginConfigFile != "" {
		_, configFilename := path.Split(pluginConfigFile)
		pluginStr = fmt.Sprintf(" --plugin-config /tmp/%s", configFilename)
	}
	onErrorContinueStr := ""
	if onErrorContinue {
		onErrorContinueStr = " --on-error-continue"
	}
	filterStr := ""
	if isFilter {
		filterStr = " --with-filters"
	}
	singleDataFileStr := ""
	if isSingleDataFile {
		singleDataFileStr = " --single-data-file"
	}
	resizeStr := ""
	if resizeCluster {
		resizeStr = fmt.Sprintf(" --resize-cluster --orig-seg-count %d --dest-seg-count %d", origSize, destSize)
	}
	remoteOutput := c.GenerateAndExecuteCommand("Starting gpbackup_helper agent", cluster.ON_SEGMENTS, func(contentID int) string {
		tocFile := fpInfo.GetSegmentTOCFilePath(contentID)
		oidFile := fpInfo.GetSegmentHelperFilePath(contentID, "oid")
		scriptFile := fpInfo.GetSegmentHelperFilePath(contentID, "script")
		pipeFile := fpInfo.GetSegmentPipeFilePath(contentID)
		backupFile := fpInfo.GetTableBackupFilePath(contentID, 0, GetPipeThroughProgram().Extension, true)
		replicatedOidFile := fpInfo.GetSegmentHelperFilePath(contentID, "replicated_oid")
		helperCmdStr := fmt.Sprintf(`gpbackup_helper %s --toc-file %s --oid-file %s --pipe-file %s --data-file "%s" --content %d%s%s%s%s%s%s --copy-queue-size %d --replication-file %s`,
			operation, tocFile, oidFile, pipeFile, backupFile, contentID, pluginStr, compressStr, onErrorContinueStr, filterStr, singleDataFileStr, resizeStr, copyQueue, replicatedOidFile)
		// We run these commands in sequence so that any failure is treated as
		// critical; the last command ensures the agent process was successfully started.
		return fmt.Sprintf(`cat << HEREDOC > %[1]s && chmod +x %[1]s && ( nohup %[1]s &> /dev/null &)
#!/bin/bash
source %[2]s/greenplum_path.sh
%[2]s/bin/%s

HEREDOC

`, scriptFile, gphomePath, helperCmdStr)
	})
	c.CheckClusterError(remoteOutput, "Error starting gpbackup_helper agent", func(contentID int) string {
		return "Error starting gpbackup_helper agent"
	})
}

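// CleanUpHelperFilesOnAllHosts removes the error, oid, and helper script files
// from each segment's data directory.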
func CleanUpHelperFilesOnAllHosts(c *cluster.Cluster, fpInfo filepath.FilePathInfo) {
	remoteOutput := c.GenerateAndExecuteCommand("Removing oid list and helper script files from segment data directories", cluster.ON_SEGMENTS, func(contentID int) string {
		errorFile := fmt.Sprintf("%s_error", fpInfo.GetSegmentPipeFilePath(contentID))
		oidFile := fpInfo.GetSegmentHelperFilePath(contentID, "oid")
		scriptFile := fpInfo.GetSegmentHelperFilePath(contentID, "script")
		return fmt.Sprintf("rm -f %s && rm -f %s && rm -f %s", errorFile, oidFile, scriptFile)
	})
	errMsg := fmt.Sprintf("Unable to remove segment helper file(s). See %s for a complete list of segments with errors and remove manually.",
		gplog.GetLogFilePath())
	c.CheckClusterError(remoteOutput, errMsg, func(contentID int) string {
		errorFile := fmt.Sprintf("%s_error", fpInfo.GetSegmentPipeFilePath(contentID))
		return fmt.Sprintf("Unable to remove helper file %s on segment %d on host %s", errorFile, contentID, c.GetHostForContent(contentID))
	}, true)
}

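// CleanUpSegmentHelperProcesses signals any running gpbackup_helper agents on
// the segments with SIGUSR1 so that they can clean up and exit.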
func CleanUpSegmentHelperProcesses(c *cluster.Cluster, fpInfo filepath.FilePathInfo, operation string) {
	helperMutex.Lock()
	defer helperMutex.Unlock()

	remoteOutput := c.GenerateAndExecuteCommand("Cleaning up segment agent processes", cluster.ON_SEGMENTS, func(contentID int) string {
		tocFile := fpInfo.GetSegmentTOCFilePath(contentID)
		procPattern := fmt.Sprintf("gpbackup_helper --%s-agent --toc-file %s", operation, tocFile)
		/*
		 * We try to avoid erroring out if no gpbackup_helper processes are found,
		 * as it's possible that all gpbackup_helper processes have finished by
		 * the time DoCleanup is called.
		 */
		return fmt.Sprintf("PIDS=`ps ux | grep \"%s\" | grep -v grep | awk '{print $2}'`; if [[ ! -z \"$PIDS\" ]]; then kill -USR1 $PIDS; fi", procPattern)
	})
	c.CheckClusterError(remoteOutput, "Unable to clean up agent processes", func(contentID int) string {
		return "Unable to clean up agent process"
	})
}

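// CheckAgentErrorsOnSegments checks each segment for an agent error file and
// returns an error if any helper agent reported a failure.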
func CheckAgentErrorsOnSegments(c *cluster.Cluster, fpInfo filepath.FilePathInfo) error {
	remoteOutput := c.GenerateAndExecuteCommand("Checking whether segment agents had errors", cluster.ON_SEGMENTS, func(contentID int) string {
		errorFile := fmt.Sprintf("%s_error", fpInfo.GetSegmentPipeFilePath(contentID))
		/*
		 * If an error file exists we want to indicate an error, as that means
		 * the agent errored out.  If no file exists, the agent was successful.
		 */
		return fmt.Sprintf("if [[ -f %s ]]; then echo 'error'; fi; rm -f %s", errorFile, errorFile)
	})

	numErrors := 0
	for contentID, cmd := range remoteOutput.Commands {
		if strings.TrimSpace(cmd.Stdout) == "error" {
			gplog.Verbose("Error occurred with helper agent on segment %d on host %s.", contentID, c.GetHostForContent(contentID))
			numErrors++
		}
	}
	if numErrors > 0 {
		helperLogName := fpInfo.GetHelperLogPath()
		return errors.Errorf("Encountered errors with %d helper agent(s).  See %s for a complete list of segments with errors, and see %s on the corresponding hosts for detailed error messages.",
			numErrors, gplog.GetLogFilePath(), helperLogName)
	}
	return nil
}

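// CreateSkipFileOnSegments creates a skip file for the given table oid on each
// segment so that the helper agent can skip that table during restore.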
func CreateSkipFileOnSegments(oid string, tableName string, c *cluster.Cluster, fpInfo filepath.FilePathInfo) {
	createSkipFileLogMsg := fmt.Sprintf("Creating skip file on segments for restore entry %s (%s)", oid, tableName)
	remoteOutput := c.GenerateAndExecuteCommand(createSkipFileLogMsg, cluster.ON_SEGMENTS, func(contentID int) string {
		return fmt.Sprintf("touch %s_skip_%s", fpInfo.GetSegmentPipeFilePath(contentID), oid)
	})
	c.CheckClusterError(remoteOutput, "Error while creating skip file on segments", func(contentID int) string {
		return fmt.Sprintf("Could not create skip file %s_skip_%s on segments", fpInfo.GetSegmentPipeFilePath(contentID), oid)
	})
}