github.com/MetalBlockchain/metalgo@v1.11.9/tests/fixture/tmpnet/node_process.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package tmpnet
     5  
     6  import (
     7  	"context"
     8  	"encoding/json"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"io/fs"
    13  	"net"
    14  	"os"
    15  	"os/exec"
    16  	"path/filepath"
    17  	"strconv"
    18  	"strings"
    19  	"syscall"
    20  	"time"
    21  
    22  	"github.com/MetalBlockchain/metalgo/api/health"
    23  	"github.com/MetalBlockchain/metalgo/config"
    24  	"github.com/MetalBlockchain/metalgo/node"
    25  	"github.com/MetalBlockchain/metalgo/utils/perms"
    26  )
    27  
const (
	// AvalancheGoPathEnvName is the name of an environment variable.
	// NOTE(review): presumably it is read elsewhere to locate the node
	// binary; it is not referenced in this file - confirm against callers.
	AvalancheGoPathEnvName = "METALGO_PATH"

	// defaultNodeInitTimeout bounds how long waitForProcessContext waits
	// for a started node to write its process context file.
	defaultNodeInitTimeout = 10 * time.Second
)

// errNodeAlreadyRunning is returned by Start when a live process is
// already associated with the node.
var errNodeAlreadyRunning = errors.New("failed to start node: node is already running")
    35  
    36  func checkNodeHealth(ctx context.Context, uri string) (bool, error) {
    37  	// Check that the node is reporting healthy
    38  	health, err := health.NewClient(uri).Health(ctx, nil)
    39  	if err == nil {
    40  		return health.Healthy, nil
    41  	}
    42  
    43  	switch t := err.(type) {
    44  	case *net.OpError:
    45  		if t.Op == "read" {
    46  			// Connection refused - potentially recoverable
    47  			return false, nil
    48  		}
    49  	case syscall.Errno:
    50  		if t == syscall.ECONNREFUSED {
    51  			// Connection refused - potentially recoverable
    52  			return false, nil
    53  		}
    54  	}
    55  	// Assume all other errors are not recoverable
    56  	return false, fmt.Errorf("failed to query node health: %w", err)
    57  }
    58  
// NodeProcess tracks the operating system process that runs a node. It
// holds a reference to the node it manages along with the PID read from
// the node's process context file.
type NodeProcess struct {
	// The node this process belongs to. Its URI and StakingAddress are
	// overwritten whenever the process context is set.
	node *Node

	// PID of the node process. Zero is treated as "not running".
	pid int
}
    67  
    68  func (p *NodeProcess) setProcessContext(processContext node.NodeProcessContext) {
    69  	p.pid = processContext.PID
    70  	p.node.URI = processContext.URI
    71  	p.node.StakingAddress = processContext.StakingAddress
    72  }
    73  
    74  func (p *NodeProcess) readState() error {
    75  	path := p.getProcessContextPath()
    76  	if _, err := os.Stat(path); errors.Is(err, fs.ErrNotExist) {
    77  		// The absence of the process context file indicates the node is not running
    78  		p.setProcessContext(node.NodeProcessContext{})
    79  		return nil
    80  	}
    81  
    82  	bytes, err := os.ReadFile(path)
    83  	if err != nil {
    84  		return fmt.Errorf("failed to read node process context: %w", err)
    85  	}
    86  	processContext := node.NodeProcessContext{}
    87  	if err := json.Unmarshal(bytes, &processContext); err != nil {
    88  		return fmt.Errorf("failed to unmarshal node process context: %w", err)
    89  	}
    90  	p.setProcessContext(processContext)
    91  	return nil
    92  }
    93  
    94  // Start waits for the process context to be written which
    95  // indicates that the node will be accepting connections on
    96  // its staking port. The network will start faster with this
    97  // synchronization due to the avoidance of exponential backoff
    98  // if a node tries to connect to a beacon that is not ready.
    99  func (p *NodeProcess) Start(w io.Writer) error {
   100  	// Avoid attempting to start an already running node.
   101  	proc, err := p.getProcess()
   102  	if err != nil {
   103  		return fmt.Errorf("failed to retrieve existing process: %w", err)
   104  	}
   105  	if proc != nil {
   106  		return errNodeAlreadyRunning
   107  	}
   108  
   109  	// Ensure a stale process context file is removed so that the
   110  	// creation of a new file can indicate node start.
   111  	if err := os.Remove(p.getProcessContextPath()); err != nil && !errors.Is(err, fs.ErrNotExist) {
   112  		return fmt.Errorf("failed to remove stale process context file: %w", err)
   113  	}
   114  
   115  	// All arguments are provided in the flags file
   116  	cmd := exec.Command(p.node.RuntimeConfig.AvalancheGoPath, "--config-file", p.node.getFlagsPath()) // #nosec G204
   117  	// Ensure process is detached from the parent process so that an error in the parent will not affect the child
   118  	configureDetachedProcess(cmd)
   119  
   120  	if err := cmd.Start(); err != nil {
   121  		return err
   122  	}
   123  
   124  	// Determine appropriate level of node description detail
   125  	dataDir := p.node.GetDataDir()
   126  	nodeDescription := fmt.Sprintf("node %q", p.node.NodeID)
   127  	if p.node.IsEphemeral {
   128  		nodeDescription = "ephemeral " + nodeDescription
   129  	}
   130  	nonDefaultNodeDir := filepath.Base(dataDir) != p.node.NodeID.String()
   131  	if nonDefaultNodeDir {
   132  		// Only include the data dir if its base is not the default (the node ID)
   133  		nodeDescription = fmt.Sprintf("%s with path: %s", nodeDescription, dataDir)
   134  	}
   135  
   136  	// A node writes a process context file on start. If the file is not
   137  	// found in a reasonable amount of time, the node is unlikely to have
   138  	// started successfully.
   139  	if err := p.waitForProcessContext(context.Background()); err != nil {
   140  		return fmt.Errorf("failed to start local node: %w", err)
   141  	}
   142  
   143  	if _, err = fmt.Fprintf(w, "Started %s\n", nodeDescription); err != nil {
   144  		return err
   145  	}
   146  
   147  	// Configure collection of metrics and logs
   148  	return p.writeMonitoringConfig()
   149  }
   150  
   151  // Signals the node process to stop.
   152  func (p *NodeProcess) InitiateStop() error {
   153  	proc, err := p.getProcess()
   154  	if err != nil {
   155  		return fmt.Errorf("failed to retrieve process to stop: %w", err)
   156  	}
   157  	if proc == nil {
   158  		// Already stopped
   159  		return p.removeMonitoringConfig()
   160  	}
   161  	if err := proc.Signal(syscall.SIGTERM); err != nil {
   162  		return fmt.Errorf("failed to send SIGTERM to pid %d: %w", p.pid, err)
   163  	}
   164  	return nil
   165  }
   166  
   167  // Waits for the node process to stop.
   168  func (p *NodeProcess) WaitForStopped(ctx context.Context) error {
   169  	ticker := time.NewTicker(defaultNodeTickerInterval)
   170  	defer ticker.Stop()
   171  	for {
   172  		proc, err := p.getProcess()
   173  		if err != nil {
   174  			return fmt.Errorf("failed to retrieve process: %w", err)
   175  		}
   176  		if proc == nil {
   177  			return p.removeMonitoringConfig()
   178  		}
   179  
   180  		select {
   181  		case <-ctx.Done():
   182  			return fmt.Errorf("failed to see node process stop %q before timeout: %w", p.node.NodeID, ctx.Err())
   183  		case <-ticker.C:
   184  		}
   185  	}
   186  }
   187  
   188  func (p *NodeProcess) IsHealthy(ctx context.Context) (bool, error) {
   189  	// Check that the node process is running as a precondition for
   190  	// checking health. getProcess will also ensure that the node's
   191  	// API URI is current.
   192  	proc, err := p.getProcess()
   193  	if err != nil {
   194  		return false, fmt.Errorf("failed to determine process status: %w", err)
   195  	}
   196  	if proc == nil {
   197  		return false, ErrNotRunning
   198  	}
   199  
   200  	return checkNodeHealth(ctx, p.node.URI)
   201  }
   202  
   203  func (p *NodeProcess) getProcessContextPath() string {
   204  	return filepath.Join(p.node.GetDataDir(), config.DefaultProcessContextFilename)
   205  }
   206  
   207  func (p *NodeProcess) waitForProcessContext(ctx context.Context) error {
   208  	ticker := time.NewTicker(defaultNodeTickerInterval)
   209  	defer ticker.Stop()
   210  
   211  	ctx, cancel := context.WithTimeout(ctx, defaultNodeInitTimeout)
   212  	defer cancel()
   213  	for len(p.node.URI) == 0 {
   214  		err := p.readState()
   215  		if err != nil {
   216  			return fmt.Errorf("failed to read process context for node %q: %w", p.node.NodeID, err)
   217  		}
   218  
   219  		select {
   220  		case <-ctx.Done():
   221  			return fmt.Errorf("failed to load process context for node %q before timeout: %w", p.node.NodeID, ctx.Err())
   222  		case <-ticker.C:
   223  		}
   224  	}
   225  	return nil
   226  }
   227  
   228  // Retrieve the node process if it is running. As part of determining
   229  // process liveness, the node's process context will be refreshed if
   230  // live or cleared if not running.
   231  func (p *NodeProcess) getProcess() (*os.Process, error) {
   232  	// Read the process context to ensure freshness. The node may have
   233  	// stopped or been restarted since last read.
   234  	if err := p.readState(); err != nil {
   235  		return nil, fmt.Errorf("failed to read process context: %w", err)
   236  	}
   237  
   238  	if p.pid == 0 {
   239  		// Process is not running
   240  		return nil, nil
   241  	}
   242  
   243  	proc, err := os.FindProcess(p.pid)
   244  	if err != nil {
   245  		return nil, fmt.Errorf("failed to find process: %w", err)
   246  	}
   247  
   248  	// Sending 0 will not actually send a signal but will perform
   249  	// error checking.
   250  	err = proc.Signal(syscall.Signal(0))
   251  	if err == nil {
   252  		// Process is running
   253  		return proc, nil
   254  	}
   255  	if errors.Is(err, os.ErrProcessDone) {
   256  		// Process is not running
   257  		return nil, nil
   258  	}
   259  	return nil, fmt.Errorf("failed to determine process status: %w", err)
   260  }
   261  
   262  // Write monitoring configuration enabling collection of metrics and logs from the node.
   263  func (p *NodeProcess) writeMonitoringConfig() error {
   264  	// Ensure labeling that uniquely identifies the node and its network
   265  	commonLabels := FlagsMap{
   266  		"network_uuid":      p.node.NetworkUUID,
   267  		"node_id":           p.node.NodeID,
   268  		"is_ephemeral_node": strconv.FormatBool(p.node.IsEphemeral),
   269  		"network_owner":     p.node.NetworkOwner,
   270  		// prometheus/promtail ignore empty values so including these
   271  		// labels with empty values outside of a github worker (where
   272  		// the env vars will not be set) should not be a problem.
   273  		"gh_repo":        os.Getenv("GH_REPO"),
   274  		"gh_workflow":    os.Getenv("GH_WORKFLOW"),
   275  		"gh_run_id":      os.Getenv("GH_RUN_ID"),
   276  		"gh_run_number":  os.Getenv("GH_RUN_NUMBER"),
   277  		"gh_run_attempt": os.Getenv("GH_RUN_ATTEMPT"),
   278  		"gh_job_id":      os.Getenv("GH_JOB_ID"),
   279  	}
   280  
   281  	tmpnetDir, err := getTmpnetPath()
   282  	if err != nil {
   283  		return err
   284  	}
   285  
   286  	prometheusConfig := []FlagsMap{
   287  		{
   288  			"targets": []string{strings.TrimPrefix(p.node.URI, "http://")},
   289  			"labels":  commonLabels,
   290  		},
   291  	}
   292  	if err := p.writeMonitoringConfigFile(tmpnetDir, "prometheus", prometheusConfig); err != nil {
   293  		return err
   294  	}
   295  
   296  	promtailLabels := FlagsMap{
   297  		"__path__": filepath.Join(p.node.GetDataDir(), "logs", "*.log"),
   298  	}
   299  	promtailLabels.SetDefaults(commonLabels)
   300  	promtailConfig := []FlagsMap{
   301  		{
   302  			"targets": []string{"localhost"},
   303  			"labels":  promtailLabels,
   304  		},
   305  	}
   306  	return p.writeMonitoringConfigFile(tmpnetDir, "promtail", promtailConfig)
   307  }
   308  
   309  // Return the path for this node's prometheus configuration.
   310  func (p *NodeProcess) getMonitoringConfigPath(tmpnetDir string, name string) string {
   311  	// Ensure a unique filename to allow config files to be added and removed
   312  	// by multiple nodes without conflict.
   313  	return filepath.Join(tmpnetDir, name, "file_sd_configs", fmt.Sprintf("%s_%s.json", p.node.NetworkUUID, p.node.NodeID))
   314  }
   315  
   316  // Ensure the removal of the prometheus configuration file for this node.
   317  func (p *NodeProcess) removeMonitoringConfig() error {
   318  	tmpnetDir, err := getTmpnetPath()
   319  	if err != nil {
   320  		return err
   321  	}
   322  
   323  	for _, name := range []string{"promtail", "prometheus"} {
   324  		configPath := p.getMonitoringConfigPath(tmpnetDir, name)
   325  		if err := os.Remove(configPath); err != nil && !errors.Is(err, fs.ErrNotExist) {
   326  			return fmt.Errorf("failed to remove %s config: %w", name, err)
   327  		}
   328  	}
   329  
   330  	return nil
   331  }
   332  
   333  // Write the configuration for a type of monitoring (e.g. prometheus, promtail).
   334  func (p *NodeProcess) writeMonitoringConfigFile(tmpnetDir string, name string, config []FlagsMap) error {
   335  	configPath := p.getMonitoringConfigPath(tmpnetDir, name)
   336  
   337  	dir := filepath.Dir(configPath)
   338  	if err := os.MkdirAll(dir, perms.ReadWriteExecute); err != nil {
   339  		return fmt.Errorf("failed to create %s service discovery dir: %w", name, err)
   340  	}
   341  
   342  	bytes, err := DefaultJSONMarshal(config)
   343  	if err != nil {
   344  		return fmt.Errorf("failed to marshal %s config: %w", name, err)
   345  	}
   346  
   347  	if err := os.WriteFile(configPath, bytes, perms.ReadWrite); err != nil {
   348  		return fmt.Errorf("failed to write %s config: %w", name, err)
   349  	}
   350  
   351  	return nil
   352  }