github.com/apache/beam/sdks/v2@v2.48.2/python/container/boot.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
// boot is the boot code for the Python SDK harness container. It is responsible
// for retrieving and installing staged files and invoking python correctly.
    18  package main
    19  
    20  import (
    21  	"context"
    22  	"encoding/json"
    23  	"errors"
    24  	"flag"
    25  	"fmt"
    26  	"log"
    27  	"os"
    28  	"os/exec"
    29  	"os/signal"
    30  	"path/filepath"
    31  	"regexp"
    32  	"strings"
    33  	"sync"
    34  	"syscall"
    35  	"time"
    36  
    37  	"github.com/apache/beam/sdks/v2/go/container/tools"
    38  	"github.com/apache/beam/sdks/v2/go/pkg/beam/artifact"
    39  	pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1"
    40  	"github.com/apache/beam/sdks/v2/go/pkg/beam/util/execx"
    41  	"github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx"
    42  	"github.com/golang/protobuf/jsonpb"
    43  	"github.com/golang/protobuf/proto"
    44  )
    45  
var (
	// acceptableWhlSpecs holds the wheel file name suffixes considered
	// installable for the local Python. It is appended to by
	// setupAcceptableWheelSpecs and passed to installSdk.
	acceptableWhlSpecs []string

	// The setup_only option runs the boot sequence so that it only processes
	// the provided artifacts and then exits; it is used to build new images
	// with dependencies pre-installed.
	setupOnly = flag.Bool("setup_only", false, "Execute boot program in setup only mode (optional).")
	artifacts = flag.String("artifacts", "", "Path to artifacts metadata file used in setup only mode (optional).")

	// Contract: https://s.apache.org/beam-fn-api-container-contract.
	//
	// The logging, artifact and control endpoint flags may be overridden by
	// the values returned from the provision service (see launchSDKProcess).

	workerPool        = flag.Bool("worker_pool", false, "Run as worker pool (optional).")
	id                = flag.String("id", "", "Local identifier (required).")
	loggingEndpoint   = flag.String("logging_endpoint", "", "Logging endpoint (required).")
	artifactEndpoint  = flag.String("artifact_endpoint", "", "Artifact endpoint (required).")
	provisionEndpoint = flag.String("provision_endpoint", "", "Provision endpoint (required).")
	controlEndpoint   = flag.String("control_endpoint", "", "Control endpoint (required).")
	semiPersistDir    = flag.String("semi_persist_dir", "/tmp", "Local semi-persistent directory (optional).")
)
    63  
const (
	// sdkHarnessEntrypoint is the Python module started with `python -m` for
	// every SDK worker.
	sdkHarnessEntrypoint = "apache_beam.runners.worker.sdk_worker_main"
	// Names of well-known staged files.
	// Please keep these names in sync with stager.py
	workflowFile      = "workflow.tar.gz"
	requirementsFile  = "requirements.txt"
	sdkSrcFile        = "dataflow_python_sdk.tar"
	extraPackagesFile = "extra_packages.txt"
	// workerPoolIdEnv is the environment variable that is set to this
	// process's PID when running in worker pool mode.
	workerPoolIdEnv   = "BEAM_PYTHON_WORKER_POOL_ID"

	// standardArtifactFileTypeUrn is the only artifact type accepted in
	// setup only mode.
	standardArtifactFileTypeUrn = "beam:artifact:type:file:v1"
)
    75  
    76  func main() {
    77  	flag.Parse()
    78  
    79  	if *setupOnly {
    80  		processArtifactsInSetupOnlyMode()
    81  		os.Exit(0)
    82  	}
    83  
    84  	if *workerPool {
    85  		workerPoolId := fmt.Sprintf("%d", os.Getpid())
    86  		os.Setenv(workerPoolIdEnv, workerPoolId)
    87  		args := []string{
    88  			"-m",
    89  			"apache_beam.runners.worker.worker_pool_main",
    90  			"--service_port=50000",
    91  			"--container_executable=/opt/apache/beam/boot",
    92  		}
    93  		log.Printf("Starting worker pool %v: python %v", workerPoolId, strings.Join(args, " "))
    94  		if err := execx.Execute("python", args...); err != nil {
    95  			log.Fatalf("Python SDK worker pool exited with error: %v", err)
    96  		}
    97  		log.Print("Python SDK worker pool exited.")
    98  		os.Exit(0)
    99  	}
   100  
   101  	if *id == "" {
   102  		log.Fatalf("No id provided.")
   103  	}
   104  	if *provisionEndpoint == "" {
   105  		log.Fatalf("No provision endpoint provided.")
   106  	}
   107  
   108  	if err := launchSDKProcess(); err != nil {
   109  		log.Fatal(err)
   110  	}
   111  }
   112  
   113  func launchSDKProcess() error {
   114  	ctx := grpcx.WriteWorkerID(context.Background(), *id)
   115  
   116  	info, err := tools.ProvisionInfo(ctx, *provisionEndpoint)
   117  	if err != nil {
   118  		log.Fatalf("Failed to obtain provisioning information: %v", err)
   119  	}
   120  	log.Printf("Provision info:\n%v", info)
   121  
   122  	// TODO(BEAM-8201): Simplify once flags are no longer used.
   123  	if info.GetLoggingEndpoint().GetUrl() != "" {
   124  		*loggingEndpoint = info.GetLoggingEndpoint().GetUrl()
   125  	}
   126  	if info.GetArtifactEndpoint().GetUrl() != "" {
   127  		*artifactEndpoint = info.GetArtifactEndpoint().GetUrl()
   128  	}
   129  	if info.GetControlEndpoint().GetUrl() != "" {
   130  		*controlEndpoint = info.GetControlEndpoint().GetUrl()
   131  	}
   132  
   133  	if *loggingEndpoint == "" {
   134  		log.Fatalf("No logging endpoint provided.")
   135  	}
   136  	if *artifactEndpoint == "" {
   137  		log.Fatalf("No artifact endpoint provided.")
   138  	}
   139  	if *controlEndpoint == "" {
   140  		log.Fatalf("No control endpoint provided.")
   141  	}
   142  	logger := &tools.Logger{Endpoint: *loggingEndpoint}
   143  	logger.Printf(ctx, "Initializing python harness: %v", strings.Join(os.Args, " "))
   144  
   145  	// (1) Obtain the pipeline options
   146  
   147  	options, err := tools.ProtoToJSON(info.GetPipelineOptions())
   148  	if err != nil {
   149  		logger.Fatalf(ctx, "Failed to convert pipeline options: %v", err)
   150  	}
   151  
   152  	// (2) Retrieve and install the staged packages.
   153  	//
   154  	// No log.Fatalf() from here on, otherwise deferred cleanups will not be called!
   155  
   156  	// Trap signals, so we can clean up properly.
   157  	signalChannel := make(chan os.Signal, 1)
   158  	signal.Notify(signalChannel, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)
   159  
   160  	// Create a separate virtual environment (with access to globally installed packages), unless disabled by the user.
   161  	// This improves usability on runners that persist the execution environment for the boot entrypoint between multiple pipeline executions.
   162  	if os.Getenv("RUN_PYTHON_SDK_IN_DEFAULT_ENVIRONMENT") == "" {
   163  		venvDir, err := setupVenv(ctx, logger, "/opt/apache/beam-venv", *id)
   164  		if err != nil {
   165  			return errors.New(
   166  				"failed to create a virtual environment. If running on Ubuntu systems, " +
   167  				"you might need to install `python3-venv` package. " +
   168  				"To run the SDK process in default environment instead, " +
   169  				"set the environment variable `RUN_PYTHON_SDK_IN_DEFAULT_ENVIRONMENT=1`. " +
   170  				"In custom Docker images, you can do that with an `ENV` statement. " +
   171  				fmt.Sprintf("Encountered error: %v", err))
   172  		}
   173  		cleanupFunc := func() {
   174  			os.RemoveAll(venvDir)
   175  			logger.Printf(ctx, "Cleaned up temporary venv for worker %v.", *id)
   176  		}
   177  		defer cleanupFunc()
   178  	}
   179  
   180  	dir := filepath.Join(*semiPersistDir, "staged")
   181  	files, err := artifact.Materialize(ctx, *artifactEndpoint, info.GetDependencies(), info.GetRetrievalToken(), dir)
   182  	if err != nil {
   183  		return fmt.Errorf("failed to retrieve staged files: %v", err)
   184  	}
   185  
   186  	// TODO(herohde): the packages to install should be specified explicitly. It
   187  	// would also be possible to install the SDK in the Dockerfile.
   188  	fileNames := make([]string, len(files))
   189  	requirementsFiles := []string{requirementsFile}
   190  	for i, v := range files {
   191  		name, _ := artifact.MustExtractFilePayload(v)
   192  		logger.Printf(ctx, "Found artifact: %s", name)
   193  		fileNames[i] = name
   194  
   195  		if v.RoleUrn == artifact.URNPipRequirementsFile {
   196  			requirementsFiles = append(requirementsFiles, name)
   197  		}
   198  	}
   199  
   200  	if setupErr := installSetupPackages(fileNames, dir, requirementsFiles); setupErr != nil {
   201  		return fmt.Errorf("failed to install required packages: %v", setupErr)
   202  	}
   203  
   204  	// (3) Invoke python
   205  
   206  	os.Setenv("PIPELINE_OPTIONS", options)
   207  	os.Setenv("SEMI_PERSISTENT_DIRECTORY", *semiPersistDir)
   208  	os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint}))
   209  	os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *controlEndpoint}))
   210  	os.Setenv("RUNNER_CAPABILITIES", strings.Join(info.GetRunnerCapabilities(), " "))
   211  
   212  	if info.GetStatusEndpoint() != nil {
   213  		os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(info.GetStatusEndpoint()))
   214  	}
   215  
   216  	if metadata := info.GetMetadata(); metadata != nil {
   217  		if jobName, nameExists := metadata["job_name"]; nameExists {
   218  			os.Setenv("JOB_NAME", jobName)
   219  		}
   220  		if jobID, idExists := metadata["job_id"]; idExists {
   221  			os.Setenv("JOB_ID", jobID)
   222  		}
   223  	}
   224  
   225  	workerIds := append([]string{*id}, info.GetSiblingWorkerIds()...)
   226  
   227  	// Keep track of child PIDs for clean shutdown without zombies
   228  	childPids := struct {
   229  		v        []int
   230  		canceled bool
   231  		mu       sync.Mutex
   232  	}{v: make([]int, 0, len(workerIds))}
   233  
   234  	// Forward trapped signals to child process groups in order to terminate them gracefully and avoid zombies
   235  	go func() {
   236  		logger.Printf(ctx, "Received signal: %v", <-signalChannel)
   237  		childPids.mu.Lock()
   238  		childPids.canceled = true
   239  		for _, pid := range childPids.v {
   240  			go func(pid int) {
   241  				// This goroutine will be canceled if the main process exits before the 5 seconds
   242  				// have elapsed, i.e., as soon as all subprocesses have returned from Wait().
   243  				time.Sleep(5 * time.Second)
   244  				if err := syscall.Kill(-pid, syscall.SIGKILL); err == nil {
   245  					logger.Printf(ctx, "Worker process %v did not respond, killed it.", pid)
   246  				}
   247  			}(pid)
   248  			syscall.Kill(-pid, syscall.SIGTERM)
   249  		}
   250  		childPids.mu.Unlock()
   251  	}()
   252  
   253  	args := []string{
   254  		"-m",
   255  		sdkHarnessEntrypoint,
   256  	}
   257  
   258  	var wg sync.WaitGroup
   259  	wg.Add(len(workerIds))
   260  	for _, workerId := range workerIds {
   261  		go func(workerId string) {
   262  			defer wg.Done()
   263  
   264  			errorCount := 0
   265  			for {
   266  				childPids.mu.Lock()
   267  				if childPids.canceled {
   268  					childPids.mu.Unlock()
   269  					return
   270  				}
   271  				logger.Printf(ctx, "Executing Python (worker %v): python %v", workerId, strings.Join(args, " "))
   272  				cmd := StartCommandEnv(map[string]string{"WORKER_ID": workerId}, "python", args...)
   273  				childPids.v = append(childPids.v, cmd.Process.Pid)
   274  				childPids.mu.Unlock()
   275  
   276  				if err := cmd.Wait(); err != nil {
   277  					// Retry on fatal errors, like OOMs and segfaults, not just
   278  					// DoFns throwing exceptions.
   279  					errorCount += 1
   280  					if errorCount < 4 {
   281  						logger.Printf(ctx, "Python (worker %v) exited %v times: %v\nrestarting SDK process",
   282  							workerId, errorCount, err)
   283  					} else {
   284  						logger.Fatalf(ctx, "Python (worker %v) exited %v times: %v\nout of retries, failing container",
   285  							workerId, errorCount, err)
   286  					}
   287  				} else {
   288  					logger.Printf(ctx, "Python (worker %v) exited.", workerId)
   289  					break
   290  				}
   291  			}
   292  		}(workerId)
   293  	}
   294  	wg.Wait()
   295  	return nil
   296  }
   297  
   298  // Start a command object in a new process group with the given arguments with
   299  // additional environment variables. It attaches stdio to the child process.
   300  // Returns the process handle.
   301  func StartCommandEnv(env map[string]string, prog string, args ...string) *exec.Cmd {
   302  	cmd := exec.Command(prog, args...)
   303  	cmd.Stdin = os.Stdin
   304  	cmd.Stdout = os.Stdout
   305  	cmd.Stderr = os.Stderr
   306  	if env != nil {
   307  		cmd.Env = os.Environ()
   308  		for k, v := range env {
   309  			cmd.Env = append(cmd.Env, k+"="+v)
   310  		}
   311  	}
   312  
   313  	// Create process group so we can clean up the whole subtree later without creating zombies
   314  	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true, Pgid: 0}
   315  	cmd.Start()
   316  	return cmd
   317  }
   318  
   319  // setupVenv initializes a local Python venv and sets the corresponding env variables
   320  func setupVenv(ctx context.Context, logger *tools.Logger, baseDir, workerId string) (string, error) {
   321  	dir := filepath.Join(baseDir, "beam-venv-worker-"+workerId)
   322  	logger.Printf(ctx, "Initializing temporary Python venv in %v", dir)
   323  	if _, err := os.Stat(dir); !os.IsNotExist(err) {
   324  		// Probably leftovers from a previous run
   325  		logger.Printf(ctx, "Cleaning up previous venv ...")
   326  		if err := os.RemoveAll(dir); err != nil {
   327  			return "", err
   328  		}
   329  	}
   330  	if err := os.MkdirAll(dir, 0750); err != nil {
   331  		return "", fmt.Errorf("failed to create Python venv directory: %s", err)
   332  	}
   333  	if err := execx.Execute("python", "-m", "venv", "--system-site-packages", dir); err != nil {
   334  		return "", fmt.Errorf("python venv initialization failed: %s", err)
   335  	}
   336  
   337  	os.Setenv("VIRTUAL_ENV", dir)
   338  	os.Setenv("PATH", strings.Join([]string{filepath.Join(dir, "bin"), os.Getenv("PATH")}, ":"))
   339  	return dir, nil
   340  }
   341  
   342  // setupAcceptableWheelSpecs setup wheel specs according to installed python version
   343  func setupAcceptableWheelSpecs() error {
   344  	cmd := exec.Command("python", "-V")
   345  	stdoutStderr, err := cmd.CombinedOutput()
   346  	if err != nil {
   347  		return err
   348  	}
   349  	re := regexp.MustCompile(`Python (\d)\.(\d+).*`)
   350  	pyVersions := re.FindStringSubmatch(string(stdoutStderr[:]))
   351  	if len(pyVersions) != 3 {
   352  		return fmt.Errorf("cannot get parse Python version from %s", stdoutStderr)
   353  	}
   354  	pyVersion := fmt.Sprintf("%s%s", pyVersions[1], pyVersions[2])
   355  	var wheelName string
   356  	switch pyVersion {
   357  	case "36", "37":
   358  		wheelName = fmt.Sprintf("cp%s-cp%sm-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", pyVersion, pyVersion)
   359  	default:
   360  		wheelName = fmt.Sprintf("cp%s-cp%s-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", pyVersion, pyVersion)
   361  	}
   362  	acceptableWhlSpecs = append(acceptableWhlSpecs, wheelName)
   363  	return nil
   364  }
   365  
   366  // installSetupPackages installs Beam SDK and user dependencies.
   367  func installSetupPackages(files []string, workDir string, requirementsFiles []string) error {
   368  	log.Printf("Installing setup packages ...")
   369  
   370  	if err := setupAcceptableWheelSpecs(); err != nil {
   371  		log.Printf("Failed to setup acceptable wheel specs, leave it as empty: %v", err)
   372  	}
   373  
   374  	// Install the Dataflow Python SDK and worker packages.
   375  	// We install the extra requirements in case of using the beam sdk. These are ignored by pip
   376  	// if the user is using an SDK that does not provide these.
   377  	if err := installSdk(files, workDir, sdkSrcFile, acceptableWhlSpecs, false); err != nil {
   378  		return fmt.Errorf("failed to install SDK: %v", err)
   379  	}
   380  	// The staged files will not disappear due to restarts because workDir is a
   381  	// folder that is mapped to the host (and therefore survives restarts).
   382  	for _, f := range requirementsFiles {
   383  		if err := pipInstallRequirements(files, workDir, f); err != nil {
   384  			return fmt.Errorf("failed to install requirements: %v", err)
   385  		}
   386  	}
   387  	if err := installExtraPackages(files, extraPackagesFile, workDir); err != nil {
   388  		return fmt.Errorf("failed to install extra packages: %v", err)
   389  	}
   390  	if err := pipInstallPackage(files, workDir, workflowFile, false, true, nil); err != nil {
   391  		return fmt.Errorf("failed to install workflow: %v", err)
   392  	}
   393  
   394  	return nil
   395  }
   396  
   397  // processArtifactsInSetupOnlyMode installs the dependencies found in artifacts
   398  // when flag --setup_only and --artifacts exist. The setup mode will only
   399  // process the provided artifacts and skip the actual worker program start up.
   400  // The mode is useful for building new images with dependencies pre-installed so
   401  // that the installation can be skipped at the pipeline runtime.
   402  func processArtifactsInSetupOnlyMode() {
   403  	if *artifacts == "" {
   404  		log.Fatal("No --artifacts provided along with --setup_only flag.")
   405  	}
   406  	workDir := filepath.Dir(*artifacts)
   407  	metadata, err := os.ReadFile(*artifacts)
   408  	if err != nil {
   409  		log.Fatalf("Unable to open artifacts metadata file %v with error %v", *artifacts, err)
   410  	}
   411  	var infoJsons []string
   412  	if err := json.Unmarshal(metadata, &infoJsons); err != nil {
   413  		log.Fatalf("Unable to parse metadata, error: %v", err)
   414  	}
   415  
   416  	files := make([]string, len(infoJsons))
   417  	for i, info := range infoJsons {
   418  		var artifactInformation pipepb.ArtifactInformation
   419  		if err := jsonpb.UnmarshalString(info, &artifactInformation); err != nil {
   420  			log.Fatalf("Unable to unmarshal artifact information from json string %v", info)
   421  		}
   422  
   423  		// For now we only expect artifacts in file type. The condition should be revisited if the assumption is not valid any more.
   424  		if artifactInformation.GetTypeUrn() != standardArtifactFileTypeUrn {
   425  			log.Fatalf("Expect file artifact type in setup only mode, found %v.", artifactInformation.GetTypeUrn())
   426  		}
   427  		filePayload := &pipepb.ArtifactFilePayload{}
   428  		if err := proto.Unmarshal(artifactInformation.GetTypePayload(), filePayload); err != nil {
   429  			log.Fatal("Unable to unmarshal artifact information type payload.")
   430  		}
   431  		files[i] = filePayload.GetPath()
   432  	}
   433  	if setupErr := installSetupPackages(files, workDir, []string{requirementsFile}); setupErr != nil {
   434  		log.Fatalf("Failed to install required packages: %v", setupErr)
   435  	}
   436  }