github.com/MetalBlockchain/metalgo@v1.11.9/tests/fixture/tmpnet/node_process.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package tmpnet 5 6 import ( 7 "context" 8 "encoding/json" 9 "errors" 10 "fmt" 11 "io" 12 "io/fs" 13 "net" 14 "os" 15 "os/exec" 16 "path/filepath" 17 "strconv" 18 "strings" 19 "syscall" 20 "time" 21 22 "github.com/MetalBlockchain/metalgo/api/health" 23 "github.com/MetalBlockchain/metalgo/config" 24 "github.com/MetalBlockchain/metalgo/node" 25 "github.com/MetalBlockchain/metalgo/utils/perms" 26 ) 27 28 const ( 29 AvalancheGoPathEnvName = "METALGO_PATH" 30 31 defaultNodeInitTimeout = 10 * time.Second 32 ) 33 34 var errNodeAlreadyRunning = errors.New("failed to start node: node is already running") 35 36 func checkNodeHealth(ctx context.Context, uri string) (bool, error) { 37 // Check that the node is reporting healthy 38 health, err := health.NewClient(uri).Health(ctx, nil) 39 if err == nil { 40 return health.Healthy, nil 41 } 42 43 switch t := err.(type) { 44 case *net.OpError: 45 if t.Op == "read" { 46 // Connection refused - potentially recoverable 47 return false, nil 48 } 49 case syscall.Errno: 50 if t == syscall.ECONNREFUSED { 51 // Connection refused - potentially recoverable 52 return false, nil 53 } 54 } 55 // Assume all other errors are not recoverable 56 return false, fmt.Errorf("failed to query node health: %w", err) 57 } 58 59 // Defines local-specific node configuration. Supports setting default 60 // and node-specific values. 61 type NodeProcess struct { 62 node *Node 63 64 // PID of the node process 65 pid int 66 } 67 68 func (p *NodeProcess) setProcessContext(processContext node.NodeProcessContext) { 69 p.pid = processContext.PID 70 p.node.URI = processContext.URI 71 p.node.StakingAddress = processContext.StakingAddress 72 } 73 74 func (p *NodeProcess) readState() error { 75 path := p.getProcessContextPath() 76 if _, err := os.Stat(path); errors.Is(err, fs.ErrNotExist) { 77 // The absence of the process context file indicates the node is not running 78 p.setProcessContext(node.NodeProcessContext{}) 79 return nil 80 } 81 82 bytes, err := os.ReadFile(path) 83 if err != nil { 84 return fmt.Errorf("failed to read node process context: %w", err) 85 } 86 processContext := node.NodeProcessContext{} 87 if err := json.Unmarshal(bytes, &processContext); err != nil { 88 return fmt.Errorf("failed to unmarshal node process context: %w", err) 89 } 90 p.setProcessContext(processContext) 91 return nil 92 } 93 94 // Start waits for the process context to be written which 95 // indicates that the node will be accepting connections on 96 // its staking port. The network will start faster with this 97 // synchronization due to the avoidance of exponential backoff 98 // if a node tries to connect to a beacon that is not ready. 99 func (p *NodeProcess) Start(w io.Writer) error { 100 // Avoid attempting to start an already running node. 101 proc, err := p.getProcess() 102 if err != nil { 103 return fmt.Errorf("failed to retrieve existing process: %w", err) 104 } 105 if proc != nil { 106 return errNodeAlreadyRunning 107 } 108 109 // Ensure a stale process context file is removed so that the 110 // creation of a new file can indicate node start. 111 if err := os.Remove(p.getProcessContextPath()); err != nil && !errors.Is(err, fs.ErrNotExist) { 112 return fmt.Errorf("failed to remove stale process context file: %w", err) 113 } 114 115 // All arguments are provided in the flags file 116 cmd := exec.Command(p.node.RuntimeConfig.AvalancheGoPath, "--config-file", p.node.getFlagsPath()) // #nosec G204 117 // Ensure process is detached from the parent process so that an error in the parent will not affect the child 118 configureDetachedProcess(cmd) 119 120 if err := cmd.Start(); err != nil { 121 return err 122 } 123 124 // Determine appropriate level of node description detail 125 dataDir := p.node.GetDataDir() 126 nodeDescription := fmt.Sprintf("node %q", p.node.NodeID) 127 if p.node.IsEphemeral { 128 nodeDescription = "ephemeral " + nodeDescription 129 } 130 nonDefaultNodeDir := filepath.Base(dataDir) != p.node.NodeID.String() 131 if nonDefaultNodeDir { 132 // Only include the data dir if its base is not the default (the node ID) 133 nodeDescription = fmt.Sprintf("%s with path: %s", nodeDescription, dataDir) 134 } 135 136 // A node writes a process context file on start. If the file is not 137 // found in a reasonable amount of time, the node is unlikely to have 138 // started successfully. 139 if err := p.waitForProcessContext(context.Background()); err != nil { 140 return fmt.Errorf("failed to start local node: %w", err) 141 } 142 143 if _, err = fmt.Fprintf(w, "Started %s\n", nodeDescription); err != nil { 144 return err 145 } 146 147 // Configure collection of metrics and logs 148 return p.writeMonitoringConfig() 149 } 150 151 // Signals the node process to stop. 152 func (p *NodeProcess) InitiateStop() error { 153 proc, err := p.getProcess() 154 if err != nil { 155 return fmt.Errorf("failed to retrieve process to stop: %w", err) 156 } 157 if proc == nil { 158 // Already stopped 159 return p.removeMonitoringConfig() 160 } 161 if err := proc.Signal(syscall.SIGTERM); err != nil { 162 return fmt.Errorf("failed to send SIGTERM to pid %d: %w", p.pid, err) 163 } 164 return nil 165 } 166 167 // Waits for the node process to stop. 168 func (p *NodeProcess) WaitForStopped(ctx context.Context) error { 169 ticker := time.NewTicker(defaultNodeTickerInterval) 170 defer ticker.Stop() 171 for { 172 proc, err := p.getProcess() 173 if err != nil { 174 return fmt.Errorf("failed to retrieve process: %w", err) 175 } 176 if proc == nil { 177 return p.removeMonitoringConfig() 178 } 179 180 select { 181 case <-ctx.Done(): 182 return fmt.Errorf("failed to see node process stop %q before timeout: %w", p.node.NodeID, ctx.Err()) 183 case <-ticker.C: 184 } 185 } 186 } 187 188 func (p *NodeProcess) IsHealthy(ctx context.Context) (bool, error) { 189 // Check that the node process is running as a precondition for 190 // checking health. getProcess will also ensure that the node's 191 // API URI is current. 192 proc, err := p.getProcess() 193 if err != nil { 194 return false, fmt.Errorf("failed to determine process status: %w", err) 195 } 196 if proc == nil { 197 return false, ErrNotRunning 198 } 199 200 return checkNodeHealth(ctx, p.node.URI) 201 } 202 203 func (p *NodeProcess) getProcessContextPath() string { 204 return filepath.Join(p.node.GetDataDir(), config.DefaultProcessContextFilename) 205 } 206 207 func (p *NodeProcess) waitForProcessContext(ctx context.Context) error { 208 ticker := time.NewTicker(defaultNodeTickerInterval) 209 defer ticker.Stop() 210 211 ctx, cancel := context.WithTimeout(ctx, defaultNodeInitTimeout) 212 defer cancel() 213 for len(p.node.URI) == 0 { 214 err := p.readState() 215 if err != nil { 216 return fmt.Errorf("failed to read process context for node %q: %w", p.node.NodeID, err) 217 } 218 219 select { 220 case <-ctx.Done(): 221 return fmt.Errorf("failed to load process context for node %q before timeout: %w", p.node.NodeID, ctx.Err()) 222 case <-ticker.C: 223 } 224 } 225 return nil 226 } 227 228 // Retrieve the node process if it is running. As part of determining 229 // process liveness, the node's process context will be refreshed if 230 // live or cleared if not running. 231 func (p *NodeProcess) getProcess() (*os.Process, error) { 232 // Read the process context to ensure freshness. The node may have 233 // stopped or been restarted since last read. 234 if err := p.readState(); err != nil { 235 return nil, fmt.Errorf("failed to read process context: %w", err) 236 } 237 238 if p.pid == 0 { 239 // Process is not running 240 return nil, nil 241 } 242 243 proc, err := os.FindProcess(p.pid) 244 if err != nil { 245 return nil, fmt.Errorf("failed to find process: %w", err) 246 } 247 248 // Sending 0 will not actually send a signal but will perform 249 // error checking. 250 err = proc.Signal(syscall.Signal(0)) 251 if err == nil { 252 // Process is running 253 return proc, nil 254 } 255 if errors.Is(err, os.ErrProcessDone) { 256 // Process is not running 257 return nil, nil 258 } 259 return nil, fmt.Errorf("failed to determine process status: %w", err) 260 } 261 262 // Write monitoring configuration enabling collection of metrics and logs from the node. 263 func (p *NodeProcess) writeMonitoringConfig() error { 264 // Ensure labeling that uniquely identifies the node and its network 265 commonLabels := FlagsMap{ 266 "network_uuid": p.node.NetworkUUID, 267 "node_id": p.node.NodeID, 268 "is_ephemeral_node": strconv.FormatBool(p.node.IsEphemeral), 269 "network_owner": p.node.NetworkOwner, 270 // prometheus/promtail ignore empty values so including these 271 // labels with empty values outside of a github worker (where 272 // the env vars will not be set) should not be a problem. 273 "gh_repo": os.Getenv("GH_REPO"), 274 "gh_workflow": os.Getenv("GH_WORKFLOW"), 275 "gh_run_id": os.Getenv("GH_RUN_ID"), 276 "gh_run_number": os.Getenv("GH_RUN_NUMBER"), 277 "gh_run_attempt": os.Getenv("GH_RUN_ATTEMPT"), 278 "gh_job_id": os.Getenv("GH_JOB_ID"), 279 } 280 281 tmpnetDir, err := getTmpnetPath() 282 if err != nil { 283 return err 284 } 285 286 prometheusConfig := []FlagsMap{ 287 { 288 "targets": []string{strings.TrimPrefix(p.node.URI, "http://")}, 289 "labels": commonLabels, 290 }, 291 } 292 if err := p.writeMonitoringConfigFile(tmpnetDir, "prometheus", prometheusConfig); err != nil { 293 return err 294 } 295 296 promtailLabels := FlagsMap{ 297 "__path__": filepath.Join(p.node.GetDataDir(), "logs", "*.log"), 298 } 299 promtailLabels.SetDefaults(commonLabels) 300 promtailConfig := []FlagsMap{ 301 { 302 "targets": []string{"localhost"}, 303 "labels": promtailLabels, 304 }, 305 } 306 return p.writeMonitoringConfigFile(tmpnetDir, "promtail", promtailConfig) 307 } 308 309 // Return the path for this node's prometheus configuration. 310 func (p *NodeProcess) getMonitoringConfigPath(tmpnetDir string, name string) string { 311 // Ensure a unique filename to allow config files to be added and removed 312 // by multiple nodes without conflict. 313 return filepath.Join(tmpnetDir, name, "file_sd_configs", fmt.Sprintf("%s_%s.json", p.node.NetworkUUID, p.node.NodeID)) 314 } 315 316 // Ensure the removal of the prometheus configuration file for this node. 317 func (p *NodeProcess) removeMonitoringConfig() error { 318 tmpnetDir, err := getTmpnetPath() 319 if err != nil { 320 return err 321 } 322 323 for _, name := range []string{"promtail", "prometheus"} { 324 configPath := p.getMonitoringConfigPath(tmpnetDir, name) 325 if err := os.Remove(configPath); err != nil && !errors.Is(err, fs.ErrNotExist) { 326 return fmt.Errorf("failed to remove %s config: %w", name, err) 327 } 328 } 329 330 return nil 331 } 332 333 // Write the configuration for a type of monitoring (e.g. prometheus, promtail). 334 func (p *NodeProcess) writeMonitoringConfigFile(tmpnetDir string, name string, config []FlagsMap) error { 335 configPath := p.getMonitoringConfigPath(tmpnetDir, name) 336 337 dir := filepath.Dir(configPath) 338 if err := os.MkdirAll(dir, perms.ReadWriteExecute); err != nil { 339 return fmt.Errorf("failed to create %s service discovery dir: %w", name, err) 340 } 341 342 bytes, err := DefaultJSONMarshal(config) 343 if err != nil { 344 return fmt.Errorf("failed to marshal %s config: %w", name, err) 345 } 346 347 if err := os.WriteFile(configPath, bytes, perms.ReadWrite); err != nil { 348 return fmt.Errorf("failed to write %s config: %w", name, err) 349 } 350 351 return nil 352 }