github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/sdk_container_builder.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""SdkContainerImageBuilder builds the portable SDK container with dependencies.

It copies the required boot dependencies, namely: the Apache Beam SDK, Python
packages from requirements.txt, Python packages from extra_packages.txt, and
the workflow tarball, into the latest public Python SDK container image, then
runs the dependency installation ahead of time with the boot program in
setup-only mode to build the new image.
"""

import json
import logging
import os
import shutil
import subprocess
import sys
import tarfile
import tempfile
import time
import uuid
from typing import Type

from google.protobuf.json_format import MessageToJson

from apache_beam import version as beam_version
from apache_beam.internal.gcp.auth import get_service_credentials
from apache_beam.internal.http_client import get_new_http
from apache_beam.io.gcp.internal.clients import storage
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions  # pylint: disable=unused-import
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners.dataflow.internal.clients import cloudbuild
from apache_beam.runners.portability.stager import Stager
from apache_beam.utils import plugin

ARTIFACTS_CONTAINER_DIR = '/opt/apache/beam/artifacts'
ARTIFACTS_MANIFEST_FILE = 'artifacts_info.json'
SDK_CONTAINER_ENTRYPOINT = '/opt/apache/beam/boot'
DOCKERFILE_TEMPLATE = (
    """FROM {base_image}
RUN mkdir -p {workdir}
COPY ./* {workdir}/
RUN {entrypoint} --setup_only --artifacts {workdir}/{manifest_file}
""")

SOURCE_FOLDER = 'source'
_LOGGER = logging.getLogger(__name__)
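
# For illustration: with the constants above, and assuming a Python 3.9
# interpreter on a released Beam 2.48.0 (both are assumptions; the actual base
# image tag is derived at runtime from the running interpreter and Beam
# version), DOCKERFILE_TEMPLATE renders to roughly:
#
#   FROM apache/beam_python3.9_sdk:2.48.0
#   RUN mkdir -p /opt/apache/beam/artifacts
#   COPY ./* /opt/apache/beam/artifacts/
#   RUN /opt/apache/beam/boot --setup_only --artifacts /opt/apache/beam/artifacts/artifacts_info.json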


class SdkContainerImageBuilder(plugin.BeamPlugin):
  def __init__(self, options):
    self._options = options
    self._docker_registry_push_url = self._options.view_as(
        SetupOptions).docker_registry_push_url
    version = (
        beam_version.__version__
        if 'dev' not in beam_version.__version__ else 'latest')
    self._base_image = (
        self._options.view_as(WorkerOptions).sdk_container_image or
        'apache/beam_python%s.%s_sdk:%s' %
        (sys.version_info[0], sys.version_info[1], version))
    self._temp_src_dir = None

  def _build(self):
    container_image_tag = str(uuid.uuid4())
    container_image_name = os.path.join(
        self._docker_registry_push_url or '',
        'beam_python_prebuilt_sdk:%s' % container_image_tag)
    with tempfile.TemporaryDirectory() as temp_folder:
      self._temp_src_dir = temp_folder
      self._prepare_dependencies()
      self._invoke_docker_build_and_push(container_image_name)

    return container_image_name

  def _prepare_dependencies(self):
    with tempfile.TemporaryDirectory() as tmp:
      artifacts = Stager.create_job_resources(self._options, tmp)
      resources = Stager.extract_staging_tuple_iter(artifacts)
      # Make a copy of the staged artifacts into the temp source folder.
      file_names = []
      for path, name, _ in resources:
        shutil.copyfile(path, os.path.join(self._temp_src_dir, name))
        file_names.append(name)
      with open(os.path.join(self._temp_src_dir, 'Dockerfile'), 'w') as file:
        file.write(
            DOCKERFILE_TEMPLATE.format(
                base_image=self._base_image,
                workdir=ARTIFACTS_CONTAINER_DIR,
                manifest_file=ARTIFACTS_MANIFEST_FILE,
                entrypoint=SDK_CONTAINER_ENTRYPOINT))
      self._generate_artifacts_manifests_json_file(
          file_names, self._temp_src_dir)

  def _invoke_docker_build_and_push(self, container_image_name):
    raise NotImplementedError

  @classmethod
  def _builder_key(cls) -> str:
    return f'{cls.__module__}.{cls.__name__}'

  @staticmethod
  def _generate_artifacts_manifests_json_file(file_names, temp_dir):
    infos = []
    for name in file_names:
      info = beam_runner_api_pb2.ArtifactInformation(
          type_urn=common_urns.StandardArtifacts.Types.FILE.urn,
          type_payload=beam_runner_api_pb2.ArtifactFilePayload(
              path=name).SerializeToString(),
      )
      infos.append(json.dumps(MessageToJson(info)))
    with open(os.path.join(temp_dir, ARTIFACTS_MANIFEST_FILE), 'w') as file:
      file.write('[\n' + ',\n'.join(infos) + '\n]')

  @classmethod
  def build_container_image(cls, pipeline_options: PipelineOptions) -> str:
    setup_options = pipeline_options.view_as(SetupOptions)
    container_build_engine = setup_options.prebuild_sdk_container_engine
    builder_cls = cls._get_subclass_by_key(container_build_engine)
    builder = builder_cls(pipeline_options)
    return builder._build()

  @classmethod
  def _get_subclass_by_key(cls, key: str) -> Type['SdkContainerImageBuilder']:
    available_builders = [
        subclass for subclass in cls.get_all_subclasses()
        if subclass._builder_key() == key
    ]
    if not available_builders:
      available_builder_keys = [
          subclass._builder_key() for subclass in cls.get_all_subclasses()
      ]
      raise ValueError(
          f'Cannot find SDK builder type {key} in '
          f'{available_builder_keys}')
    elif len(available_builders) > 1:
      raise ValueError(f'Found multiple builders under key {key}')
    return available_builders[0]
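
# Illustrative use, not a prescription: a pipeline author normally triggers
# this builder through pipeline options rather than calling it directly, but
# build_container_image is the programmatic entry point. A minimal sketch
# (the registry URL is an example value only):
#
#   from apache_beam.options.pipeline_options import PipelineOptions
#
#   options = PipelineOptions([
#       '--prebuild_sdk_container_engine=local_docker',
#       '--docker_registry_push_url=us.gcr.io/my-project/my-repo',
#   ])
#   image = SdkContainerImageBuilder.build_container_image(options)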


class _SdkContainerImageLocalBuilder(SdkContainerImageBuilder):
  """_SdkContainerImageLocalBuilder builds the sdk container image with local
  docker."""
  @classmethod
  def _builder_key(cls):
    return 'local_docker'

  def _invoke_docker_build_and_push(self, container_image_name):
    try:
      _LOGGER.info("Building sdk container, this may take a few minutes...")
      now = time.time()
      subprocess.run(['docker', 'build', '.', '-t', container_image_name],
                     check=True,
                     cwd=self._temp_src_dir)
    except subprocess.CalledProcessError as err:
      raise RuntimeError(
          'Failed to build sdk container with local docker, '
          'stderr:\n %s.' % err.stderr)
    else:
      _LOGGER.info(
          "Successfully built %s in %.2f seconds" %
          (container_image_name, time.time() - now))

    if self._docker_registry_push_url:
      _LOGGER.info("Pushing prebuilt sdk container...")
      try:
        subprocess.run(['docker', 'push', container_image_name], check=True)
      except subprocess.CalledProcessError as err:
        raise RuntimeError(
            'Failed to push prebuilt sdk container %s, stderr: \n%s' %
            (container_image_name, err.stderr))
      _LOGGER.info(
          "Successfully pushed %s in %.2f seconds" %
          (container_image_name, time.time() - now))
    else:
      _LOGGER.info(
          "No --docker_registry_push_url option was specified in pipeline "
          "options; specify it if the new image is intended to be "
          "pushed to a registry.")
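
# For reference, the local builder above is roughly equivalent to running the
# following from the temporary source directory (the image name is shown
# schematically; the real tag is a fresh uuid4):
#
#   docker build . -t <registry>/beam_python_prebuilt_sdk:<uuid>
#   docker push <registry>/beam_python_prebuilt_sdk:<uuid>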


class _SdkContainerImageCloudBuilder(SdkContainerImageBuilder):
  """_SdkContainerImageCloudBuilder builds the sdk container image with Google
  Cloud Build."""
  def __init__(self, options):
    super().__init__(options)
    self._google_cloud_options = options.view_as(GoogleCloudOptions)
    self._cloud_build_machine_type = self._get_cloud_build_machine_type_enum(
        options.view_as(SetupOptions).cloud_build_machine_type)
    if self._google_cloud_options.no_auth:
      credentials = None
    else:
      credentials = get_service_credentials(options)
    self._storage_client = storage.StorageV1(
        url='https://www.googleapis.com/storage/v1',
        credentials=credentials,
        get_credentials=(not self._google_cloud_options.no_auth),
        http=get_new_http(),
        response_encoding='utf8')
    self._cloudbuild_client = cloudbuild.CloudbuildV1(
        credentials=credentials,
        get_credentials=(not self._google_cloud_options.no_auth),
        http=get_new_http(),
        response_encoding='utf8')
    if not self._docker_registry_push_url:
      self._docker_registry_push_url = (
          'gcr.io/%s/prebuilt_beam_sdk' % self._google_cloud_options.project)

  @classmethod
  def _builder_key(cls):
    return 'cloud_build'

  def _invoke_docker_build_and_push(self, container_image_name):
    project_id = self._google_cloud_options.project
    temp_location = self._google_cloud_options.temp_location
    # The Google Cloud Build service expects all the build source files to be
    # compressed into a tarball.
    tarball_path = os.path.join(self._temp_src_dir, '%s.tgz' % SOURCE_FOLDER)
    self._make_tarfile(tarball_path, self._temp_src_dir)
    _LOGGER.info(
        "Compressed source files for building sdk container at %s" %
        tarball_path)

    container_image_tag = container_image_name.split(':')[-1]
    gcs_location = os.path.join(
        temp_location, '%s-%s.tgz' % (SOURCE_FOLDER, container_image_tag))
    self._upload_to_gcs(tarball_path, gcs_location)

    build = cloudbuild.Build()
    if self._cloud_build_machine_type:
      build.options = cloudbuild.BuildOptions()
      build.options.machineType = self._cloud_build_machine_type
    build.steps = []
    step = cloudbuild.BuildStep()
    step.name = 'gcr.io/kaniko-project/executor:latest'
    step.args = ['--destination=' + container_image_name, '--cache=true']
    step.dir = SOURCE_FOLDER

    build.steps.append(step)

    source = cloudbuild.Source()
    source.storageSource = cloudbuild.StorageSource()
    gcs_bucket, gcs_object = self._get_gcs_bucket_and_name(gcs_location)
    source.storageSource.bucket = gcs_bucket
    source.storageSource.object = gcs_object
    build.source = source
    # TODO(zyichi): make timeout configurable
    build.timeout = '7200s'

    now = time.time()
    request = cloudbuild.CloudbuildProjectsBuildsCreateRequest(
        projectId=project_id, build=build)
    build = self._cloudbuild_client.projects_builds.Create(request)
    build_id, log_url = self._get_cloud_build_id_and_log_url(build.metadata)
    _LOGGER.info(
        'Building sdk container with Google Cloud Build, this may '
        'take a few minutes; you may check the build log at %s' % log_url)

    # Block until the build finishes; if the build fails, an exception is
    # raised, which stops the job submission.
    response = self._cloudbuild_client.projects_builds.Get(
        cloudbuild.CloudbuildProjectsBuildsGetRequest(
            id=build_id, projectId=project_id))
    while response.status in [cloudbuild.Build.StatusValueValuesEnum.QUEUED,
                              cloudbuild.Build.StatusValueValuesEnum.PENDING,
                              cloudbuild.Build.StatusValueValuesEnum.WORKING]:
      time.sleep(10)
      response = self._cloudbuild_client.projects_builds.Get(
          cloudbuild.CloudbuildProjectsBuildsGetRequest(
              id=build_id, projectId=project_id))

    if response.status != cloudbuild.Build.StatusValueValuesEnum.SUCCESS:
      raise RuntimeError(
          'Failed to build python sdk container image on google cloud build; '
          'please check the build log for errors.')

    _LOGGER.info(
        "Python SDK container pre-build finished in %.2f seconds" %
        (time.time() - now))
    _LOGGER.info(
        "Python SDK container built and pushed as %s." % container_image_name)
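
  # For orientation: the Build proto assembled above corresponds roughly to
  # this Cloud Build config (this module always constructs the request
  # programmatically; the YAML form is shown only for illustration):
  #
  #   steps:
  #   - name: gcr.io/kaniko-project/executor:latest
  #     args: ['--destination=<container_image_name>', '--cache=true']
  #     dir: source
  #   timeout: 7200s
  #
  # with the build source supplied as the source-<tag>.tgz tarball uploaded
  # to the pipeline's temp_location above.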

  def _upload_to_gcs(self, local_file_path, gcs_location):
    gcs_bucket, gcs_object = self._get_gcs_bucket_and_name(gcs_location)
    request = storage.StorageObjectsInsertRequest(
        bucket=gcs_bucket, name=gcs_object)
    _LOGGER.info('Starting GCS upload to %s...', gcs_location)
    total_size = os.path.getsize(local_file_path)
    from apitools.base.py import exceptions
    try:
      with open(local_file_path, 'rb') as stream:
        upload = storage.Upload(stream, 'application/octet-stream', total_size)
        self._storage_client.objects.Insert(request, upload=upload)
    except exceptions.HttpError as e:
      reportable_errors = {
          403: 'access denied',
          404: 'bucket not found',
      }
      if e.status_code in reportable_errors:
        raise IOError((
            'Could not upload to GCS path %s: %s. Please verify '
            'that credentials are valid and that you have write '
            'access to the specified path.') %
                      (gcs_location, reportable_errors[e.status_code]))
      raise
    _LOGGER.info('Completed GCS upload to %s.', gcs_location)

  def _get_cloud_build_id_and_log_url(self, metadata):
    build_id = None
    log_url = None
    for item in metadata.additionalProperties:
      if item.key == 'build':
        for field in item.value.object_value.properties:
          if field.key == 'logUrl':
            log_url = field.value.string_value
          if field.key == 'id':
            build_id = field.value.string_value
    return build_id, log_url

  @staticmethod
  def _get_gcs_bucket_and_name(gcs_location):
    # Splits a 'gs://<bucket>/<object>' path into (bucket, object), e.g.
    # 'gs://my-bucket/staging/source.tgz' -> ('my-bucket', 'staging/source.tgz').
    return gcs_location[5:].split('/', 1)

  @staticmethod
  def _make_tarfile(output_filename, source_dir):
    with tarfile.open(output_filename, "w:gz") as tar:
      tar.add(source_dir, arcname=SOURCE_FOLDER)

  @staticmethod
  def _get_cloud_build_machine_type_enum(machine_type: str):
    if not machine_type:
      return None
    mappings = {
        'n1-highcpu-8': cloudbuild.BuildOptions.MachineTypeValueValuesEnum.
        N1_HIGHCPU_8,
        'n1-highcpu-32': cloudbuild.BuildOptions.MachineTypeValueValuesEnum.
        N1_HIGHCPU_32,
        'e2-highcpu-8': cloudbuild.BuildOptions.MachineTypeValueValuesEnum.
        E2_HIGHCPU_8,
        'e2-highcpu-32': cloudbuild.BuildOptions.MachineTypeValueValuesEnum.
        E2_HIGHCPU_32
    }
    if machine_type.lower() in mappings:
      return mappings[machine_type.lower()]
    else:
      raise ValueError(
          'Unknown Cloud Build machine type option; please specify one of '
          '[n1-highcpu-8, n1-highcpu-32, e2-highcpu-8, e2-highcpu-32].')
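
# Because SdkContainerImageBuilder is a BeamPlugin, third-party builders can
# also be plugged in: any imported subclass is discoverable by
# _get_subclass_by_key, under its fully qualified '<module>.<class>' key by
# default. A hypothetical sketch (all names are invented for illustration):
#
#   class MyRegistryBuilder(SdkContainerImageBuilder):
#     def _invoke_docker_build_and_push(self, container_image_name):
#       ...  # build and push the image with your own tooling
#
# selected at submission time with
#   --prebuild_sdk_container_engine=my_module.MyRegistryBuilder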