github.com/apache/beam/sdks/v2@v2.48.2/python/container/boot.go

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// boot is the boot code for the Python SDK harness container. It is responsible
// for retrieving and installing staged files and invoking python correctly.
package main

import (
	"context"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"log"
	"os"
	"os/exec"
	"os/signal"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/apache/beam/sdks/v2/go/container/tools"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/artifact"
	pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/util/execx"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx"
	"github.com/golang/protobuf/jsonpb"
	"github.com/golang/protobuf/proto"
)

var (
	acceptableWhlSpecs []string

	// setupOnly is used to invoke the boot sequence so that it only processes the
	// provided artifacts, for building new images with dependencies pre-cached.
	setupOnly = flag.Bool("setup_only", false, "Execute boot program in setup only mode (optional).")
	artifacts = flag.String("artifacts", "", "Path to artifacts metadata file used in setup only mode (optional).")

	// Contract: https://s.apache.org/beam-fn-api-container-contract.
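	//
	// Example invocation under the container contract (a sketch; the id and
	// endpoint values below are hypothetical and are normally supplied by the
	// runner):
	//
	//	/opt/apache/beam/boot --id=1 --provision_endpoint=localhost:12370
	//
	// The logging, artifact, and control endpoints may be passed as flags or
	// obtained from the provision service response (see launchSDKProcess).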

	workerPool        = flag.Bool("worker_pool", false, "Run as worker pool (optional).")
	id                = flag.String("id", "", "Local identifier (required).")
	loggingEndpoint   = flag.String("logging_endpoint", "", "Logging endpoint (required).")
	artifactEndpoint  = flag.String("artifact_endpoint", "", "Artifact endpoint (required).")
	provisionEndpoint = flag.String("provision_endpoint", "", "Provision endpoint (required).")
	controlEndpoint   = flag.String("control_endpoint", "", "Control endpoint (required).")
	semiPersistDir    = flag.String("semi_persist_dir", "/tmp", "Local semi-persistent directory (optional).")
)

const (
	sdkHarnessEntrypoint = "apache_beam.runners.worker.sdk_worker_main"
	// Please keep these names in sync with stager.py
	workflowFile      = "workflow.tar.gz"
	requirementsFile  = "requirements.txt"
	sdkSrcFile        = "dataflow_python_sdk.tar"
	extraPackagesFile = "extra_packages.txt"
	workerPoolIdEnv   = "BEAM_PYTHON_WORKER_POOL_ID"

	standardArtifactFileTypeUrn = "beam:artifact:type:file:v1"
)

func main() {
	flag.Parse()

	if *setupOnly {
		processArtifactsInSetupOnlyMode()
		os.Exit(0)
	}

	if *workerPool {
		workerPoolId := fmt.Sprintf("%d", os.Getpid())
		os.Setenv(workerPoolIdEnv, workerPoolId)
		args := []string{
			"-m",
			"apache_beam.runners.worker.worker_pool_main",
			"--service_port=50000",
			"--container_executable=/opt/apache/beam/boot",
		}
		log.Printf("Starting worker pool %v: python %v", workerPoolId, strings.Join(args, " "))
		if err := execx.Execute("python", args...); err != nil {
			log.Fatalf("Python SDK worker pool exited with error: %v", err)
		}
		log.Print("Python SDK worker pool exited.")
		os.Exit(0)
	}

	if *id == "" {
		log.Fatalf("No id provided.")
	}
	if *provisionEndpoint == "" {
		log.Fatalf("No provision endpoint provided.")
	}

	if err := launchSDKProcess(); err != nil {
		log.Fatal(err)
	}
}

func launchSDKProcess() error {
	ctx := grpcx.WriteWorkerID(context.Background(), *id)

	info, err := tools.ProvisionInfo(ctx, *provisionEndpoint)
	if err != nil {
		log.Fatalf("Failed to obtain provisioning information: %v", err)
	}
	log.Printf("Provision info:\n%v", info)

	// TODO(BEAM-8201): Simplify once flags are no longer used.
	if info.GetLoggingEndpoint().GetUrl() != "" {
		*loggingEndpoint = info.GetLoggingEndpoint().GetUrl()
	}
	if info.GetArtifactEndpoint().GetUrl() != "" {
		*artifactEndpoint = info.GetArtifactEndpoint().GetUrl()
	}
	if info.GetControlEndpoint().GetUrl() != "" {
		*controlEndpoint = info.GetControlEndpoint().GetUrl()
	}

	if *loggingEndpoint == "" {
		log.Fatalf("No logging endpoint provided.")
	}
	if *artifactEndpoint == "" {
		log.Fatalf("No artifact endpoint provided.")
	}
	if *controlEndpoint == "" {
		log.Fatalf("No control endpoint provided.")
	}
	logger := &tools.Logger{Endpoint: *loggingEndpoint}
	logger.Printf(ctx, "Initializing python harness: %v", strings.Join(os.Args, " "))

	// (1) Obtain the pipeline options

	options, err := tools.ProtoToJSON(info.GetPipelineOptions())
	if err != nil {
		logger.Fatalf(ctx, "Failed to convert pipeline options: %v", err)
	}

	// (2) Retrieve and install the staged packages.
	//
	// No log.Fatalf() from here on, otherwise deferred cleanups will not be called!

	// Trap signals, so we can clean up properly.
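	// Signals received on this channel are forwarded to the Python worker
	// process groups started below, giving them a chance to shut down
	// gracefully before being killed.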
	signalChannel := make(chan os.Signal, 1)
	signal.Notify(signalChannel, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)

	// Create a separate virtual environment (with access to globally installed packages), unless disabled by the user.
	// This improves usability on runners that persist the execution environment for the boot entrypoint between multiple pipeline executions.
	if os.Getenv("RUN_PYTHON_SDK_IN_DEFAULT_ENVIRONMENT") == "" {
		venvDir, err := setupVenv(ctx, logger, "/opt/apache/beam-venv", *id)
		if err != nil {
			return errors.New(
				"failed to create a virtual environment. If running on Ubuntu systems, " +
					"you might need to install `python3-venv` package. " +
					"To run the SDK process in default environment instead, " +
					"set the environment variable `RUN_PYTHON_SDK_IN_DEFAULT_ENVIRONMENT=1`. " +
					"In custom Docker images, you can do that with an `ENV` statement. " +
					fmt.Sprintf("Encountered error: %v", err))
		}
		cleanupFunc := func() {
			os.RemoveAll(venvDir)
			logger.Printf(ctx, "Cleaned up temporary venv for worker %v.", *id)
		}
		defer cleanupFunc()
	}

	dir := filepath.Join(*semiPersistDir, "staged")
	files, err := artifact.Materialize(ctx, *artifactEndpoint, info.GetDependencies(), info.GetRetrievalToken(), dir)
	if err != nil {
		return fmt.Errorf("failed to retrieve staged files: %v", err)
	}

	// TODO(herohde): the packages to install should be specified explicitly. It
	// would also be possible to install the SDK in the Dockerfile.
	fileNames := make([]string, len(files))
	requirementsFiles := []string{requirementsFile}
	for i, v := range files {
		name, _ := artifact.MustExtractFilePayload(v)
		logger.Printf(ctx, "Found artifact: %s", name)
		fileNames[i] = name

		if v.RoleUrn == artifact.URNPipRequirementsFile {
			requirementsFiles = append(requirementsFiles, name)
		}
	}

	if setupErr := installSetupPackages(fileNames, dir, requirementsFiles); setupErr != nil {
		return fmt.Errorf("failed to install required packages: %v", setupErr)
	}

	// (3) Invoke python

	os.Setenv("PIPELINE_OPTIONS", options)
	os.Setenv("SEMI_PERSISTENT_DIRECTORY", *semiPersistDir)
	os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint}))
	os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *controlEndpoint}))
	os.Setenv("RUNNER_CAPABILITIES", strings.Join(info.GetRunnerCapabilities(), " "))

	if info.GetStatusEndpoint() != nil {
		os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(info.GetStatusEndpoint()))
	}

	if metadata := info.GetMetadata(); metadata != nil {
		if jobName, nameExists := metadata["job_name"]; nameExists {
			os.Setenv("JOB_NAME", jobName)
		}
		if jobID, idExists := metadata["job_id"]; idExists {
			os.Setenv("JOB_ID", jobID)
		}
	}

	workerIds := append([]string{*id}, info.GetSiblingWorkerIds()...)
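	// The provision info may ask this container to host additional sibling
	// workers; each worker id gets its own SDK harness process in the loop
	// below.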

	// Keep track of child PIDs for clean shutdown without zombies
	childPids := struct {
		v        []int
		canceled bool
		mu       sync.Mutex
	}{v: make([]int, 0, len(workerIds))}

	// Forward trapped signals to child process groups in order to terminate them gracefully and avoid zombies
	go func() {
		logger.Printf(ctx, "Received signal: %v", <-signalChannel)
		childPids.mu.Lock()
		childPids.canceled = true
		for _, pid := range childPids.v {
			go func(pid int) {
				// This goroutine will be canceled if the main process exits before the 5 seconds
				// have elapsed, i.e., as soon as all subprocesses have returned from Wait().
				time.Sleep(5 * time.Second)
				if err := syscall.Kill(-pid, syscall.SIGKILL); err == nil {
					logger.Printf(ctx, "Worker process %v did not respond, killed it.", pid)
				}
			}(pid)
			syscall.Kill(-pid, syscall.SIGTERM)
		}
		childPids.mu.Unlock()
	}()

	args := []string{
		"-m",
		sdkHarnessEntrypoint,
	}

	var wg sync.WaitGroup
	wg.Add(len(workerIds))
	for _, workerId := range workerIds {
		go func(workerId string) {
			defer wg.Done()

			errorCount := 0
			for {
				childPids.mu.Lock()
				if childPids.canceled {
					childPids.mu.Unlock()
					return
				}
				logger.Printf(ctx, "Executing Python (worker %v): python %v", workerId, strings.Join(args, " "))
				cmd := StartCommandEnv(map[string]string{"WORKER_ID": workerId}, "python", args...)
				childPids.v = append(childPids.v, cmd.Process.Pid)
				childPids.mu.Unlock()

				if err := cmd.Wait(); err != nil {
					// Retry on fatal errors, like OOMs and segfaults, not just
					// DoFns throwing exceptions.
					errorCount += 1
					if errorCount < 4 {
						logger.Printf(ctx, "Python (worker %v) exited %v times: %v\nrestarting SDK process",
							workerId, errorCount, err)
					} else {
						logger.Fatalf(ctx, "Python (worker %v) exited %v times: %v\nout of retries, failing container",
							workerId, errorCount, err)
					}
				} else {
					logger.Printf(ctx, "Python (worker %v) exited.", workerId)
					break
				}
			}
		}(workerId)
	}
	wg.Wait()
	return nil
}

// StartCommandEnv starts a command in a new process group with the given
// arguments and additional environment variables. It attaches stdio to the
// child process and returns the process handle.
func StartCommandEnv(env map[string]string, prog string, args ...string) *exec.Cmd {
	cmd := exec.Command(prog, args...)
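	// Inherit the container's stdio so the worker's output is visible in the
	// container logs.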
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if env != nil {
		cmd.Env = os.Environ()
		for k, v := range env {
			cmd.Env = append(cmd.Env, k+"="+v)
		}
	}

	// Create a process group so we can clean up the whole subtree later without creating zombies
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true, Pgid: 0}
	cmd.Start()
	return cmd
}

// setupVenv initializes a local Python venv and sets the corresponding env variables
func setupVenv(ctx context.Context, logger *tools.Logger, baseDir, workerId string) (string, error) {
	dir := filepath.Join(baseDir, "beam-venv-worker-"+workerId)
	logger.Printf(ctx, "Initializing temporary Python venv in %v", dir)
	if _, err := os.Stat(dir); !os.IsNotExist(err) {
		// Probably leftovers from a previous run
		logger.Printf(ctx, "Cleaning up previous venv ...")
		if err := os.RemoveAll(dir); err != nil {
			return "", err
		}
	}
	if err := os.MkdirAll(dir, 0750); err != nil {
		return "", fmt.Errorf("failed to create Python venv directory: %s", err)
	}
	if err := execx.Execute("python", "-m", "venv", "--system-site-packages", dir); err != nil {
		return "", fmt.Errorf("python venv initialization failed: %s", err)
	}

	os.Setenv("VIRTUAL_ENV", dir)
	os.Setenv("PATH", strings.Join([]string{filepath.Join(dir, "bin"), os.Getenv("PATH")}, ":"))
	return dir, nil
}

// setupAcceptableWheelSpecs sets up the acceptable wheel specs according to the installed Python version.
func setupAcceptableWheelSpecs() error {
	cmd := exec.Command("python", "-V")
	stdoutStderr, err := cmd.CombinedOutput()
	if err != nil {
		return err
	}
	re := regexp.MustCompile(`Python (\d)\.(\d+).*`)
	pyVersions := re.FindStringSubmatch(string(stdoutStderr[:]))
	if len(pyVersions) != 3 {
		return fmt.Errorf("cannot parse Python version from %s", stdoutStderr)
	}
	pyVersion := fmt.Sprintf("%s%s", pyVersions[1], pyVersions[2])
	var wheelName string
	switch pyVersion {
	case "36", "37":
		wheelName = fmt.Sprintf("cp%s-cp%sm-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", pyVersion, pyVersion)
	default:
		wheelName = fmt.Sprintf("cp%s-cp%s-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", pyVersion, pyVersion)
	}
	acceptableWhlSpecs = append(acceptableWhlSpecs, wheelName)
	return nil
}

// installSetupPackages installs Beam SDK and user dependencies.
func installSetupPackages(files []string, workDir string, requirementsFiles []string) error {
	log.Printf("Installing setup packages ...")

	if err := setupAcceptableWheelSpecs(); err != nil {
		log.Printf("Failed to set up acceptable wheel specs, leaving them empty: %v", err)
	}

	// Install the Dataflow Python SDK and worker packages.
	// We install the extra requirements in case the Beam SDK is used. These are ignored by pip
	// if the user is using an SDK that does not provide these.
	if err := installSdk(files, workDir, sdkSrcFile, acceptableWhlSpecs, false); err != nil {
		return fmt.Errorf("failed to install SDK: %v", err)
	}
	// The staged files will not disappear due to restarts because workDir is a
	// folder that is mapped to the host (and therefore survives restarts).
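	// Install the default requirements file along with any additional
	// requirements files staged with the pip requirements role.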
	for _, f := range requirementsFiles {
		if err := pipInstallRequirements(files, workDir, f); err != nil {
			return fmt.Errorf("failed to install requirements: %v", err)
		}
	}
	if err := installExtraPackages(files, extraPackagesFile, workDir); err != nil {
		return fmt.Errorf("failed to install extra packages: %v", err)
	}
	if err := pipInstallPackage(files, workDir, workflowFile, false, true, nil); err != nil {
		return fmt.Errorf("failed to install workflow: %v", err)
	}

	return nil
}

// processArtifactsInSetupOnlyMode installs the dependencies found in artifacts
// when the --setup_only and --artifacts flags are provided. Setup-only mode only
// processes the provided artifacts and skips the actual worker program start-up.
// This mode is useful for building new images with dependencies pre-installed so
// that the installation can be skipped at pipeline runtime. The artifacts
// metadata file is expected to contain a JSON list of strings, each of which is
// a JSON-serialized ArtifactInformation proto describing a file artifact.
func processArtifactsInSetupOnlyMode() {
	if *artifacts == "" {
		log.Fatal("No --artifacts provided along with --setup_only flag.")
	}
	workDir := filepath.Dir(*artifacts)
	metadata, err := os.ReadFile(*artifacts)
	if err != nil {
		log.Fatalf("Unable to open artifacts metadata file %v with error %v", *artifacts, err)
	}
	var infoJsons []string
	if err := json.Unmarshal(metadata, &infoJsons); err != nil {
		log.Fatalf("Unable to parse metadata, error: %v", err)
	}

	files := make([]string, len(infoJsons))
	for i, info := range infoJsons {
		var artifactInformation pipepb.ArtifactInformation
		if err := jsonpb.UnmarshalString(info, &artifactInformation); err != nil {
			log.Fatalf("Unable to unmarshal artifact information from json string %v", info)
		}

		// For now we only expect artifacts of the file type. This condition should be revisited if the assumption is no longer valid.
		if artifactInformation.GetTypeUrn() != standardArtifactFileTypeUrn {
			log.Fatalf("Expected file artifact type in setup only mode, found %v.", artifactInformation.GetTypeUrn())
		}
		filePayload := &pipepb.ArtifactFilePayload{}
		if err := proto.Unmarshal(artifactInformation.GetTypePayload(), filePayload); err != nil {
			log.Fatal("Unable to unmarshal artifact information type payload.")
		}
		files[i] = filePayload.GetPath()
	}
	if setupErr := installSetupPackages(files, workDir, []string{requirementsFile}); setupErr != nil {
		log.Fatalf("Failed to install required packages: %v", setupErr)
	}
}
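
// Example setup-only invocation (a sketch; the metadata file path below is
// hypothetical and depends on how the image build stages its artifacts):
//
//	/opt/apache/beam/boot --setup_only --artifacts=/opt/apache/beam/artifacts/metadata.json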