github.com/filecoin-project/bacalhau@v0.3.23-0.20230228154132-45c989550ace/cmd/bacalhau/serve.go (about) 1 package bacalhau 2 3 import ( 4 "context" 5 "fmt" 6 "os" 7 "sort" 8 "strings" 9 "time" 10 11 "github.com/filecoin-project/bacalhau/pkg/compute/capacity" 12 "github.com/filecoin-project/bacalhau/pkg/ipfs" 13 "github.com/filecoin-project/bacalhau/pkg/jobstore/inmemory" 14 "github.com/filecoin-project/bacalhau/pkg/libp2p" 15 "github.com/filecoin-project/bacalhau/pkg/libp2p/rcmgr" 16 "github.com/filecoin-project/bacalhau/pkg/logger" 17 "github.com/filecoin-project/bacalhau/pkg/model" 18 "github.com/filecoin-project/bacalhau/pkg/node" 19 filecoinlotus "github.com/filecoin-project/bacalhau/pkg/publisher/filecoin_lotus" 20 "github.com/filecoin-project/bacalhau/pkg/system" 21 "github.com/filecoin-project/bacalhau/pkg/util/templates" 22 "github.com/multiformats/go-multiaddr" 23 24 "github.com/rs/zerolog/log" 25 "github.com/spf13/cobra" 26 "k8s.io/kubectl/pkg/util/i18n" 27 ) 28 29 var DefaultSwarmPort = 1235 30 31 var ( 32 serveLong = templates.LongDesc(i18n.T(` 33 Start a bacalhau node. 34 `)) 35 36 serveExample = templates.Examples(i18n.T(` 37 # Start a bacalhau compute node 38 bacalhau serve 39 # or 40 bacalhau serve --node-type compute 41 42 # Start a bacalhau requester node 43 bacalhau serve --node-type requester 44 45 # Start a bacalhau hybrid node that acts as both compute and requester 46 bacalhau serve --node-type compute --node-type requester 47 # or 48 bacalhau serve --node-type compute,requester 49 `)) 50 ) 51 52 //nolint:lll // Documentation 53 type ServeOptions struct { 54 NodeType []string // "compute", "requester" node or both 55 PeerConnect string // The libp2p multiaddress to connect to. 56 IPFSConnect string // The multiaddress to connect to for IPFS. 57 FilecoinUnsealedPath string // Go template to turn a Filecoin CID into a local filepath with the unsealed data. 58 EstuaryAPIKey string // The API key used when using the estuary API. 59 HostAddress string // The host address to listen on. 60 SwarmPort int // The host port for libp2p network. 61 JobSelectionDataLocality string // The data locality to use for job selection. 62 JobSelectionDataRejectStateless bool // Whether to reject jobs that don't specify any data. 63 JobSelectionDataAcceptNetworked bool // Whether to accept jobs that require network access. 64 JobSelectionProbeHTTP string // The HTTP URL to use for job selection. 65 JobSelectionProbeExec string // The executable to use for job selection. 66 LimitTotalCPU string // The total amount of CPU the system can be using at one time. 67 LimitTotalMemory string // The total amount of memory the system can be using at one time. 68 LimitTotalGPU string // The total amount of GPU the system can be using at one time. 69 LimitJobCPU string // The amount of CPU the system can be using at one time for a single job. 70 LimitJobMemory string // The amount of memory the system can be using at one time for a single job. 71 LimitJobGPU string // The amount of GPU the system can be using at one time for a single job. 72 LotusFilecoinStorageDuration time.Duration // How long deals should be for the Lotus Filecoin publisher 73 LotusFilecoinPathDirectory string // The location of the Lotus configuration directory which contains config.toml, etc 74 LotusFilecoinUploadDirectory string // Directory to put files when uploading to Lotus (optional) 75 LotusFilecoinMaximumPing time.Duration // The maximum ping allowed when selecting a Filecoin miner 76 JobExecutionTimeoutClientIDBypassList []string // IDs of clients that can submit jobs more than the configured job execution timeout 77 Labels map[string]string // Labels to apply to the node that can be used for node selection and filtering 78 IPFSSwarmAddresses []string // IPFS multiaddresses that the in-process IPFS should connect to 79 PrivateInternalIPFS bool // Whether the in-process IPFS should automatically discover other IPFS nodes 80 } 81 82 func NewServeOptions() *ServeOptions { 83 return &ServeOptions{ 84 NodeType: []string{"compute"}, 85 PeerConnect: "", 86 IPFSConnect: "", 87 FilecoinUnsealedPath: "", 88 EstuaryAPIKey: os.Getenv("ESTUARY_API_KEY"), 89 HostAddress: "0.0.0.0", 90 SwarmPort: DefaultSwarmPort, 91 JobSelectionDataLocality: "local", 92 JobSelectionDataRejectStateless: false, 93 JobSelectionDataAcceptNetworked: false, 94 JobSelectionProbeHTTP: "", 95 JobSelectionProbeExec: "", 96 LimitTotalCPU: "", 97 LimitTotalMemory: "", 98 LimitTotalGPU: "", 99 LimitJobCPU: "", 100 LimitJobMemory: "", 101 LimitJobGPU: "", 102 LotusFilecoinPathDirectory: os.Getenv("LOTUS_PATH"), 103 LotusFilecoinMaximumPing: 2 * time.Second, 104 } 105 } 106 107 func setupJobSelectionCLIFlags(cmd *cobra.Command, OS *ServeOptions) { 108 cmd.PersistentFlags().StringVar( 109 &OS.JobSelectionDataLocality, "job-selection-data-locality", OS.JobSelectionDataLocality, 110 `Only accept jobs that reference data we have locally ("local") or anywhere ("anywhere").`, 111 ) 112 cmd.PersistentFlags().BoolVar( 113 &OS.JobSelectionDataRejectStateless, "job-selection-reject-stateless", OS.JobSelectionDataRejectStateless, 114 `Reject jobs that don't specify any data.`, 115 ) 116 cmd.PersistentFlags().BoolVar( 117 &OS.JobSelectionDataAcceptNetworked, "job-selection-accept-networked", OS.JobSelectionDataAcceptNetworked, 118 `Accept jobs that require network access.`, 119 ) 120 cmd.PersistentFlags().StringVar( 121 &OS.JobSelectionProbeHTTP, "job-selection-probe-http", OS.JobSelectionProbeHTTP, 122 `Use the result of a HTTP POST to decide if we should take on the job.`, 123 ) 124 cmd.PersistentFlags().StringVar( 125 &OS.JobSelectionProbeExec, "job-selection-probe-exec", OS.JobSelectionProbeExec, 126 `Use the result of a exec an external program to decide if we should take on the job.`, 127 ) 128 } 129 130 func setupCapacityManagerCLIFlags(cmd *cobra.Command, OS *ServeOptions) { 131 cmd.PersistentFlags().StringVar( 132 &OS.LimitTotalCPU, "limit-total-cpu", OS.LimitTotalCPU, 133 `Total CPU core limit to run all jobs (e.g. 500m, 2, 8).`, 134 ) 135 cmd.PersistentFlags().StringVar( 136 &OS.LimitTotalMemory, "limit-total-memory", OS.LimitTotalMemory, 137 `Total Memory limit to run all jobs (e.g. 500Mb, 2Gb, 8Gb).`, 138 ) 139 cmd.PersistentFlags().StringVar( 140 &OS.LimitTotalGPU, "limit-total-gpu", OS.LimitTotalGPU, 141 `Total GPU limit to run all jobs (e.g. 1, 2, or 8).`, 142 ) 143 cmd.PersistentFlags().StringVar( 144 &OS.LimitJobCPU, "limit-job-cpu", OS.LimitJobCPU, 145 `Job CPU core limit for single job (e.g. 500m, 2, 8).`, 146 ) 147 cmd.PersistentFlags().StringVar( 148 &OS.LimitJobMemory, "limit-job-memory", OS.LimitJobMemory, 149 `Job Memory limit for single job (e.g. 500Mb, 2Gb, 8Gb).`, 150 ) 151 cmd.PersistentFlags().StringVar( 152 &OS.LimitJobGPU, "limit-job-gpu", OS.LimitJobGPU, 153 `Job GPU limit for single job (e.g. 1, 2, or 8).`, 154 ) 155 cmd.PersistentFlags().StringSliceVar( 156 &OS.JobExecutionTimeoutClientIDBypassList, "job-execution-timeout-bypass-client-id", OS.JobExecutionTimeoutClientIDBypassList, 157 `List of IDs of clients that are allowed to bypass the job execution timeout check`, 158 ) 159 } 160 161 func setupLibp2pCLIFlags(cmd *cobra.Command, OS *ServeOptions) { 162 cmd.PersistentFlags().StringVar( 163 &OS.PeerConnect, "peer", OS.PeerConnect, 164 `The libp2p multiaddress to connect to.`, 165 ) 166 cmd.PersistentFlags().StringVar( 167 &OS.HostAddress, "host", OS.HostAddress, 168 `The host to listen on (for both api and swarm connections).`, 169 ) 170 cmd.PersistentFlags().IntVar( 171 &OS.SwarmPort, "swarm-port", OS.SwarmPort, 172 `The port to listen on for swarm connections.`, 173 ) 174 } 175 176 func getPeers(OS *ServeOptions) ([]multiaddr.Multiaddr, error) { 177 var peersStrings []string 178 if OS.PeerConnect == "none" { 179 peersStrings = []string{} 180 } else if OS.PeerConnect == "" { 181 peersStrings = system.Envs[system.GetEnvironment()].BootstrapAddresses 182 } else { 183 peersStrings = strings.Split(OS.PeerConnect, ",") 184 } 185 186 peers := make([]multiaddr.Multiaddr, 0, len(peersStrings)) 187 for _, peer := range peersStrings { 188 parsed, err := multiaddr.NewMultiaddr(peer) 189 if err != nil { 190 return nil, err 191 } 192 peers = append(peers, parsed) 193 } 194 return peers, nil 195 } 196 197 func getJobSelectionConfig(OS *ServeOptions) model.JobSelectionPolicy { 198 // construct the job selection policy from the CLI args 199 typedJobSelectionDataLocality := model.Anywhere 200 201 if OS.JobSelectionDataLocality == "anywhere" { 202 typedJobSelectionDataLocality = model.Anywhere 203 } 204 205 jobSelectionPolicy := model.JobSelectionPolicy{ 206 Locality: typedJobSelectionDataLocality, 207 RejectStatelessJobs: OS.JobSelectionDataRejectStateless, 208 AcceptNetworkedJobs: OS.JobSelectionDataAcceptNetworked, 209 ProbeHTTP: OS.JobSelectionProbeHTTP, 210 ProbeExec: OS.JobSelectionProbeExec, 211 } 212 213 return jobSelectionPolicy 214 } 215 216 func getComputeConfig(OS *ServeOptions) node.ComputeConfig { 217 return node.NewComputeConfigWith(node.ComputeConfigParams{ 218 JobSelectionPolicy: getJobSelectionConfig(OS), 219 TotalResourceLimits: capacity.ParseResourceUsageConfig(model.ResourceUsageConfig{ 220 CPU: OS.LimitTotalCPU, 221 Memory: OS.LimitTotalMemory, 222 GPU: OS.LimitTotalGPU, 223 }), 224 JobResourceLimits: capacity.ParseResourceUsageConfig(model.ResourceUsageConfig{ 225 CPU: OS.LimitJobCPU, 226 Memory: OS.LimitJobMemory, 227 GPU: OS.LimitJobGPU, 228 }), 229 IgnorePhysicalResourceLimits: os.Getenv("BACALHAU_CAPACITY_MANAGER_OVER_COMMIT") != "", 230 JobExecutionTimeoutClientIDBypassList: OS.JobExecutionTimeoutClientIDBypassList, 231 }) 232 } 233 234 func newServeCmd() *cobra.Command { 235 OS := NewServeOptions() 236 237 serveCmd := &cobra.Command{ 238 Use: "serve", 239 Short: "Start the bacalhau compute node", 240 Long: serveLong, 241 Example: serveExample, 242 RunE: func(cmd *cobra.Command, _ []string) error { 243 return serve(cmd, OS) 244 }, 245 } 246 247 serveCmd.PersistentFlags().StringSliceVar( 248 &OS.NodeType, "node-type", OS.NodeType, 249 `Whether the node is a compute, requester or both.`, 250 ) 251 252 serveCmd.PersistentFlags().StringToStringVar( 253 &OS.Labels, "labels", OS.Labels, 254 `Labels to be associated with the node that can be used for node selection and filtering. (e.g. --labels key1=value1,key2=value2)`, 255 ) 256 257 serveCmd.PersistentFlags().StringVar( 258 &OS.IPFSConnect, "ipfs-connect", OS.IPFSConnect, 259 `The ipfs host multiaddress to connect to, otherwise an in-process IPFS node will be created if not set.`, 260 ) 261 serveCmd.PersistentFlags().StringVar( 262 &OS.FilecoinUnsealedPath, "filecoin-unsealed-path", OS.FilecoinUnsealedPath, 263 `The go template that can turn a filecoin CID into a local filepath with the unsealed data.`, 264 ) 265 serveCmd.PersistentFlags().StringVar( 266 &OS.EstuaryAPIKey, "estuary-api-key", OS.EstuaryAPIKey, 267 `The API key used when using the estuary API.`, 268 ) 269 serveCmd.PersistentFlags().DurationVar( 270 &OS.LotusFilecoinStorageDuration, "lotus-storage-duration", OS.LotusFilecoinStorageDuration, 271 "Duration to store data in Lotus Filecoin for.", 272 ) 273 serveCmd.PersistentFlags().StringVar( 274 &OS.LotusFilecoinPathDirectory, "lotus-path-directory", OS.LotusFilecoinPathDirectory, 275 "Location of the Lotus Filecoin configuration directory.", 276 ) 277 serveCmd.PersistentFlags().StringVar( 278 &OS.LotusFilecoinUploadDirectory, "lotus-upload-directory", OS.LotusFilecoinUploadDirectory, 279 "Directory to use when uploading content to Lotus Filecoin.", 280 ) 281 serveCmd.PersistentFlags().DurationVar( 282 &OS.LotusFilecoinMaximumPing, "lotus-max-ping", OS.LotusFilecoinMaximumPing, 283 "The highest ping a Filecoin miner could have when selecting.", 284 ) 285 serveCmd.PersistentFlags().StringSliceVar( 286 &OS.IPFSSwarmAddresses, "ipfs-swarm-addr", OS.IPFSSwarmAddresses, 287 "IPFS multiaddress to connect the in-process IPFS node to - cannot be used with --ipfs-connect.", 288 ) 289 serveCmd.PersistentFlags().BoolVar( 290 &OS.PrivateInternalIPFS, "private-internal-ipfs", OS.PrivateInternalIPFS, 291 "Whether the in-process IPFS node should auto-discover other nodes, including the public IPFS network - "+ 292 "cannot be used with --ipfs-connect.", 293 ) 294 295 setupLibp2pCLIFlags(serveCmd, OS) 296 setupJobSelectionCLIFlags(serveCmd, OS) 297 setupCapacityManagerCLIFlags(serveCmd, OS) 298 299 return serveCmd 300 } 301 302 //nolint:funlen,gocyclo 303 func serve(cmd *cobra.Command, OS *ServeOptions) error { 304 ctx := cmd.Context() 305 cm := ctx.Value(systemManagerKey).(*system.CleanupManager) 306 307 isComputeNode, isRequesterNode := false, false 308 for _, nodeType := range OS.NodeType { 309 if nodeType == "compute" { 310 isComputeNode = true 311 } else if nodeType == "requester" { 312 isRequesterNode = true 313 } else { 314 return fmt.Errorf("invalid node type %s. Only compute and requester values are supported", nodeType) 315 } 316 } 317 318 if OS.JobSelectionDataLocality != "local" && OS.JobSelectionDataLocality != "anywhere" { 319 return fmt.Errorf("--job-selection-data-locality must be either 'local' or 'anywhere'") 320 } 321 322 if OS.IPFSConnect != "" && OS.PrivateInternalIPFS { 323 return fmt.Errorf("--private-internal-ipfs cannot be used with --ipfs-connect") 324 } 325 326 if OS.IPFSConnect != "" && len(OS.IPFSSwarmAddresses) != 0 { 327 return fmt.Errorf("--ipfs-swarm-addr cannot be used with --ipfs-connect") 328 } 329 330 // Establishing p2p connection 331 peers, err := getPeers(OS) 332 if err != nil { 333 return err 334 } 335 log.Ctx(ctx).Debug().Msgf("libp2p connecting to: %s", peers) 336 337 libp2pHost, err := libp2p.NewHost(OS.SwarmPort, rcmgr.DefaultResourceManager) 338 if err != nil { 339 Fatal(cmd, fmt.Sprintf("Error creating libp2p host: %s", err), 1) 340 } 341 cm.RegisterCallback(libp2pHost.Close) 342 343 // add nodeID to logging context 344 ctx = logger.ContextWithNodeIDLogger(ctx, libp2pHost.ID().String()) 345 346 // Establishing IPFS connection 347 ipfsClient, err := ipfsClient(ctx, OS, cm) 348 if err != nil { 349 return err 350 } 351 352 datastore := inmemory.NewJobStore() 353 if err != nil { 354 return fmt.Errorf("error creating in memory datastore: %s", err) 355 } 356 357 // Create node config from cmd arguments 358 nodeConfig := node.NodeConfig{ 359 IPFSClient: ipfsClient, 360 CleanupManager: cm, 361 JobStore: datastore, 362 Host: libp2pHost, 363 FilecoinUnsealedPath: OS.FilecoinUnsealedPath, 364 EstuaryAPIKey: OS.EstuaryAPIKey, 365 HostAddress: OS.HostAddress, 366 APIPort: apiPort, 367 ComputeConfig: getComputeConfig(OS), 368 RequesterNodeConfig: node.NewRequesterConfigWithDefaults(), 369 IsComputeNode: isComputeNode, 370 IsRequesterNode: isRequesterNode, 371 Labels: OS.Labels, 372 } 373 374 if OS.LotusFilecoinStorageDuration != time.Duration(0) && 375 OS.LotusFilecoinPathDirectory != "" && 376 OS.LotusFilecoinMaximumPing != time.Duration(0) { 377 nodeConfig.LotusConfig = &filecoinlotus.PublisherConfig{ 378 StorageDuration: OS.LotusFilecoinStorageDuration, 379 PathDir: OS.LotusFilecoinPathDirectory, 380 UploadDir: OS.LotusFilecoinUploadDirectory, 381 MaximumPing: OS.LotusFilecoinMaximumPing, 382 } 383 } 384 385 // Create node 386 standardNode, err := node.NewStandardNode(ctx, nodeConfig) 387 if err != nil { 388 return fmt.Errorf("error creating node: %s", err) 389 } 390 391 // Start transport layer 392 err = libp2p.ConnectToPeersContinuously(ctx, cm, libp2pHost, peers) 393 if err != nil { 394 return err 395 } 396 397 // Start node 398 err = standardNode.Start(ctx) 399 if err != nil { 400 return fmt.Errorf("error starting node: %s", err) 401 } 402 403 if OS.PrivateInternalIPFS && OS.PeerConnect == "none" { 404 nodeType := "" 405 if !isRequesterNode { 406 nodeType = "--node-type requester " 407 } 408 409 ipfsAddresses, err := ipfsClient.SwarmMultiAddresses(ctx) 410 if err != nil { 411 return fmt.Errorf("error looking up IPFS addresses: %s", err) 412 } 413 414 p2pAddr, err := multiaddr.NewMultiaddr("/p2p/" + libp2pHost.ID().String()) 415 if err != nil { 416 return err 417 } 418 419 peerAddress := pickP2pAddress(libp2pHost.Addrs()).Encapsulate(p2pAddr).String() 420 ipfsSwarmAddress := pickP2pAddress(ipfsAddresses).String() 421 422 cmd.Println() 423 cmd.Println("To connect another node to this private one, run the following command in your shell:") 424 cmd.Printf( 425 "%s serve %s--private-internal-ipfs --peer %s --ipfs-swarm-addr %s\n", 426 os.Args[0], nodeType, peerAddress, ipfsSwarmAddress, 427 ) 428 429 if isRequesterNode { 430 cmd.Println() 431 cmd.Println("To use this requester node from the client, run the following commands in your shell:") 432 cmd.Printf("export BACALHAU_IPFS_SWARM_ADDRESSES=%s\n", ipfsSwarmAddress) 433 cmd.Printf("export BACALHAU_API_HOST=%s\n", OS.HostAddress) 434 cmd.Printf("export BACALHAU_API_PORT=%d\n", apiPort) 435 } 436 } 437 438 <-ctx.Done() // block until killed 439 return nil 440 } 441 442 // pickP2pAddress will aim to select a non-localhost IPv4 TCP address, or at least a non-localhost IPv6 one, from a list 443 // of addresses. 444 func pickP2pAddress(addresses []multiaddr.Multiaddr) multiaddr.Multiaddr { 445 value := func(m multiaddr.Multiaddr) int { 446 count := 0 447 if _, err := m.ValueForProtocol(multiaddr.P_TCP); err == nil { 448 count++ 449 } 450 if ip, err := m.ValueForProtocol(multiaddr.P_IP4); err == nil { 451 count++ 452 if ip != "127.0.0.1" { 453 count++ 454 } 455 } else if ip, err := m.ValueForProtocol(multiaddr.P_IP6); err == nil && ip != "::1" { 456 count++ 457 } 458 return count 459 } 460 sort.Slice(addresses, func(i, j int) bool { 461 return value(addresses[i]) > value(addresses[j]) 462 }) 463 464 return addresses[0] 465 } 466 467 func ipfsClient(ctx context.Context, OS *ServeOptions, cm *system.CleanupManager) (ipfs.Client, error) { 468 if OS.IPFSConnect == "" { 469 // Connect to the public IPFS nodes by default 470 newNode := ipfs.NewNode 471 if OS.PrivateInternalIPFS { 472 newNode = ipfs.NewLocalNode 473 } 474 475 ipfsNode, err := newNode(ctx, cm, OS.IPFSSwarmAddresses) 476 if err != nil { 477 return ipfs.Client{}, fmt.Errorf("error creating IPFS node: %s", err) 478 } 479 cm.RegisterCallbackWithContext(ipfsNode.Close) 480 client := ipfsNode.Client() 481 482 swarmAddresses, err := client.SwarmAddresses(ctx) 483 if err != nil { 484 return ipfs.Client{}, fmt.Errorf("error looking up IPFS addresses: %s", err) 485 } 486 487 log.Ctx(ctx).Info().Strs("ipfs_swarm_addresses", swarmAddresses).Msg("Internal IPFS node available") 488 return client, nil 489 } 490 491 client, err := ipfs.NewClientUsingRemoteHandler(ctx, OS.IPFSConnect) 492 if err != nil { 493 return ipfs.Client{}, fmt.Errorf("error creating IPFS client: %s", err) 494 } 495 496 return client, nil 497 }