github.com/filecoin-project/bacalhau@v0.3.23-0.20230228154132-45c989550ace/cmd/bacalhau/devstack.go (about)

     1  package bacalhau
     2  
     3  import (
     4  	"fmt"
     5  	"os"
     6  	"path/filepath"
     7  	"strconv"
     8  
     9  	"github.com/filecoin-project/bacalhau/pkg/config"
    10  	"github.com/filecoin-project/bacalhau/pkg/devstack"
    11  	"github.com/filecoin-project/bacalhau/pkg/node"
    12  	"github.com/filecoin-project/bacalhau/pkg/system"
    13  	"github.com/filecoin-project/bacalhau/pkg/telemetry"
    14  	"github.com/filecoin-project/bacalhau/pkg/util/templates"
    15  	"k8s.io/kubectl/pkg/util/i18n"
    16  
    17  	"github.com/spf13/cobra"
    18  )
    19  
    20  var (
    21  	devStackLong = templates.LongDesc(i18n.T(`
    22  		Start a cluster of nodes and run a job on them.
    23  `))
    24  
    25  	//nolint:lll // Documentation
    26  	devstackExample = templates.Examples(i18n.T(`
    27  		# Create a devstack cluster with a single requester node and 3 compute nodes (Default values)
    28  		bacalhau devstack 
    29  
    30  		# Create a devstack cluster with a two requester nodes and 10 compute nodes
    31  		bacalhau devstack  --requester-nodes 2 --compute-nodes 10
    32  
    33  		# Create a devstack cluster with a single hybrid (requester and compute) nodes
    34  		bacalhau devstack  --requester-nodes 0 --compute-nodes 0 --hybrid-nodes 1 
    35  `))
    36  )
    37  
    38  func newDevStackOptions() *devstack.DevStackOptions {
    39  	return &devstack.DevStackOptions{
    40  		NumberOfRequesterOnlyNodes: 1,
    41  		NumberOfComputeOnlyNodes:   3,
    42  		NumberOfBadComputeActors:   0,
    43  		Peer:                       "",
    44  		PublicIPFSMode:             false,
    45  		EstuaryAPIKey:              os.Getenv("ESTUARY_API_KEY"),
    46  		LocalNetworkLotus:          false,
    47  		SimulatorAddr:              "",
    48  		SimulatorMode:              false,
    49  		CPUProfilingFile:           "",
    50  		MemoryProfilingFile:        "",
    51  	}
    52  }
    53  
    54  func newDevStackCmd() *cobra.Command {
    55  	ODs := newDevStackOptions()
    56  	OS := NewServeOptions()
    57  	IsNoop := false
    58  
    59  	devstackCmd := &cobra.Command{
    60  		Use:     "devstack",
    61  		Short:   "Start a cluster of bacalhau nodes for testing and development",
    62  		Long:    devStackLong,
    63  		Example: devstackExample,
    64  		RunE: func(cmd *cobra.Command, _ []string) error {
    65  			return runDevstack(cmd, ODs, OS, IsNoop)
    66  		},
    67  	}
    68  
    69  	devstackCmd.PersistentFlags().IntVar(
    70  		&ODs.NumberOfHybridNodes, "hybrid-nodes", ODs.NumberOfHybridNodes,
    71  		`How many hybrid (requester and compute) nodes should be started in the cluster`,
    72  	)
    73  	devstackCmd.PersistentFlags().IntVar(
    74  		&ODs.NumberOfRequesterOnlyNodes, "requester-nodes", ODs.NumberOfRequesterOnlyNodes,
    75  		`How many requester only nodes should be started in the cluster`,
    76  	)
    77  	devstackCmd.PersistentFlags().IntVar(
    78  		&ODs.NumberOfComputeOnlyNodes, "compute-nodes", ODs.NumberOfComputeOnlyNodes,
    79  		`How many compute only nodes should be started in the cluster`,
    80  	)
    81  	devstackCmd.PersistentFlags().IntVar(
    82  		&ODs.NumberOfBadComputeActors, "bad-compute-actors", ODs.NumberOfBadComputeActors,
    83  		`How many compute nodes should be bad actors`,
    84  	)
    85  	devstackCmd.PersistentFlags().IntVar(
    86  		&ODs.NumberOfBadRequesterActors, "bad-requester-actors", ODs.NumberOfBadRequesterActors,
    87  		`How many requester nodes should be bad actors`,
    88  	)
    89  	devstackCmd.PersistentFlags().BoolVar(
    90  		&IsNoop, "noop", false,
    91  		`Use the noop executor and verifier for all jobs`,
    92  	)
    93  	devstackCmd.PersistentFlags().StringVar(
    94  		&ODs.Peer, "peer", ODs.Peer,
    95  		`Connect node 0 to another network node`,
    96  	)
    97  	devstackCmd.PersistentFlags().BoolVar(
    98  		&ODs.LocalNetworkLotus, "lotus-node", ODs.LocalNetworkLotus,
    99  		"Also start a Lotus FileCoin instance",
   100  	)
   101  	devstackCmd.PersistentFlags().StringVar(
   102  		&ODs.SimulatorAddr, "simulator-addr", ODs.SimulatorAddr,
   103  		`Use the simulator transport at the given node multi addr`,
   104  	)
   105  	devstackCmd.PersistentFlags().BoolVar(
   106  		&ODs.SimulatorMode, "simulator-mode", false,
   107  		`If set, one of the nodes will act as a simulator and will proxy all requests to the other nodes`,
   108  	)
   109  	devstackCmd.PersistentFlags().BoolVar(
   110  		&ODs.PublicIPFSMode, "public-ipfs", ODs.PublicIPFSMode,
   111  		`Connect devstack to public IPFS`,
   112  	)
   113  	devstackCmd.PersistentFlags().StringVar(
   114  		&ODs.CPUProfilingFile, "cpu-profiling-file", ODs.CPUProfilingFile,
   115  		"File to save CPU profiling to",
   116  	)
   117  	devstackCmd.PersistentFlags().StringVar(
   118  		&ODs.MemoryProfilingFile, "memory-profiling-file", ODs.MemoryProfilingFile,
   119  		"File to save memory profiling to",
   120  	)
   121  
   122  	setupJobSelectionCLIFlags(devstackCmd, OS)
   123  	setupCapacityManagerCLIFlags(devstackCmd, OS)
   124  
   125  	return devstackCmd
   126  }
   127  
   128  func runDevstack(cmd *cobra.Command, ODs *devstack.DevStackOptions, OS *ServeOptions, IsNoop bool) error {
   129  	ctx := cmd.Context()
   130  
   131  	cm := ctx.Value(systemManagerKey).(*system.CleanupManager)
   132  
   133  	if config.DevstackShouldWriteEnvFile() {
   134  		cm.RegisterCallback(cleanupDevstackDotEnv)
   135  	}
   136  	cm.RegisterCallback(telemetry.Cleanup)
   137  
   138  	config.DevstackSetShouldPrintInfo()
   139  
   140  	totalComputeNodes := ODs.NumberOfComputeOnlyNodes + ODs.NumberOfHybridNodes
   141  	totalRequesterNodes := ODs.NumberOfRequesterOnlyNodes + ODs.NumberOfHybridNodes
   142  	if ODs.NumberOfBadComputeActors > totalComputeNodes {
   143  		Fatal(cmd, fmt.Sprintf("You cannot have more bad compute actors (%d) than there are nodes (%d).",
   144  			ODs.NumberOfBadComputeActors, totalComputeNodes), 1)
   145  	}
   146  	if ODs.NumberOfBadRequesterActors > totalRequesterNodes {
   147  		Fatal(cmd, fmt.Sprintf("You cannot have more bad requester actors (%d) than there are nodes (%d).",
   148  			ODs.NumberOfBadRequesterActors, totalRequesterNodes), 1)
   149  	}
   150  
   151  	portFileName := filepath.Join(os.TempDir(), "bacalhau-devstack.port")
   152  	pidFileName := filepath.Join(os.TempDir(), "bacalhau-devstack.pid")
   153  
   154  	if _, ignore := os.LookupEnv("IGNORE_PID_AND_PORT_FILES"); !ignore {
   155  		_, err := os.Stat(portFileName)
   156  		if err == nil {
   157  			Fatal(cmd, fmt.Sprintf("Found file %s - Devstack likely already running", portFileName), 1)
   158  		}
   159  		_, err = os.Stat(pidFileName)
   160  		if err == nil {
   161  			Fatal(cmd, fmt.Sprintf("Found file %s - Devstack likely already running", pidFileName), 1)
   162  		}
   163  	}
   164  
   165  	computeConfig := getComputeConfig(OS)
   166  	if ODs.LocalNetworkLotus {
   167  		cmd.Println("Note that starting up the Lotus node can take many minutes!")
   168  	}
   169  
   170  	var stack *devstack.DevStack
   171  	var stackErr error
   172  	if IsNoop {
   173  		stack, stackErr = devstack.NewNoopDevStack(ctx, cm, *ODs, computeConfig, node.NewRequesterConfigWithDefaults())
   174  	} else {
   175  		stack, stackErr = devstack.NewStandardDevStack(ctx, cm, *ODs, computeConfig, node.NewRequesterConfigWithDefaults())
   176  	}
   177  	if stackErr != nil {
   178  		return stackErr
   179  	}
   180  
   181  	nodeInfoOutput, err := stack.PrintNodeInfo(ctx)
   182  	if err != nil {
   183  		Fatal(cmd, fmt.Sprintf("Failed to print node info: %s", err.Error()), 1)
   184  	}
   185  	cmd.Println(nodeInfoOutput)
   186  
   187  	f, err := os.Create(portFileName)
   188  	if err != nil {
   189  		Fatal(cmd, fmt.Sprintf("Error writing out port file to %v", portFileName), 1)
   190  	}
   191  	defer os.Remove(portFileName)
   192  	firstNode := stack.Nodes[0]
   193  	_, err = f.WriteString(strconv.Itoa(firstNode.APIServer.Port))
   194  	if err != nil {
   195  		Fatal(cmd, fmt.Sprintf("Error writing out port file: %v", portFileName), 1)
   196  	}
   197  
   198  	fPid, err := os.Create(pidFileName)
   199  	if err != nil {
   200  		Fatal(cmd, fmt.Sprintf("Error writing out pid file to %v", pidFileName), 1)
   201  	}
   202  	defer os.Remove(pidFileName)
   203  
   204  	_, err = fPid.WriteString(strconv.Itoa(os.Getpid()))
   205  	if err != nil {
   206  		Fatal(cmd, fmt.Sprintf("Error writing out pid file: %v", pidFileName), 1)
   207  	}
   208  
   209  	<-ctx.Done() // block until killed
   210  
   211  	cmd.Println("Shutting down devstack")
   212  	return nil
   213  }
   214  
   215  func cleanupDevstackDotEnv() error {
   216  	if _, err := os.Stat(config.DevstackEnvFile()); err == nil {
   217  		return os.Remove(config.DevstackEnvFile())
   218  	}
   219  	return nil
   220  }