github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/sdk_container_builder.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """SdkContainerImageBuilder builds the portable SDK container with dependencies.
    19  
    20  It copies the right boot dependencies, namely: apache beam sdk, python packages
    21  from requirements.txt, python packages from extra_packages.txt, workflow
    22  tarball, into the latest public python sdk container image, and run the
    23  dependencies installation in advance with the boot program in setup only mode
    24  to build the new image.
    25  """

import json
import logging
import os
import shutil
import subprocess
import sys
import tarfile
import tempfile
import time
import uuid
from typing import Type

from google.protobuf.json_format import MessageToJson

from apache_beam import version as beam_version
from apache_beam.internal.gcp.auth import get_service_credentials
from apache_beam.internal.http_client import get_new_http
from apache_beam.io.gcp.internal.clients import storage
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions  # pylint: disable=unused-import
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners.dataflow.internal.clients import cloudbuild
from apache_beam.runners.portability.stager import Stager
from apache_beam.utils import plugin

ARTIFACTS_CONTAINER_DIR = '/opt/apache/beam/artifacts'
ARTIFACTS_MANIFEST_FILE = 'artifacts_info.json'
SDK_CONTAINER_ENTRYPOINT = '/opt/apache/beam/boot'
DOCKERFILE_TEMPLATE = (
    """FROM {base_image}
RUN mkdir -p {workdir}
COPY ./* {workdir}/
RUN {entrypoint} --setup_only --artifacts {workdir}/{manifest_file}
""")

SOURCE_FOLDER = 'source'
_LOGGER = logging.getLogger(__name__)


class SdkContainerImageBuilder(plugin.BeamPlugin):
  def __init__(self, options):
    self._options = options
    self._docker_registry_push_url = self._options.view_as(
        SetupOptions).docker_registry_push_url
    version = (
        beam_version.__version__
        if 'dev' not in beam_version.__version__ else 'latest')
    self._base_image = (
        self._options.view_as(WorkerOptions).sdk_container_image or
        'apache/beam_python%s.%s_sdk:%s' %
        (sys.version_info[0], sys.version_info[1], version))
    self._temp_src_dir = None

  def _build(self):
    container_image_tag = str(uuid.uuid4())
    container_image_name = os.path.join(
        self._docker_registry_push_url or '',
        'beam_python_prebuilt_sdk:%s' % container_image_tag)
    with tempfile.TemporaryDirectory() as temp_folder:
      self._temp_src_dir = temp_folder
      self._prepare_dependencies()
      self._invoke_docker_build_and_push(container_image_name)

    return container_image_name

  def _prepare_dependencies(self):
    with tempfile.TemporaryDirectory() as tmp:
      artifacts = Stager.create_job_resources(self._options, tmp)
      resources = Stager.extract_staging_tuple_iter(artifacts)
      # Copy the staged artifacts into the temp source folder.
      file_names = []
      for path, name, _ in resources:
        shutil.copyfile(path, os.path.join(self._temp_src_dir, name))
        file_names.append(name)
      with open(os.path.join(self._temp_src_dir, 'Dockerfile'), 'w') as file:
        file.write(
            DOCKERFILE_TEMPLATE.format(
                base_image=self._base_image,
                workdir=ARTIFACTS_CONTAINER_DIR,
                manifest_file=ARTIFACTS_MANIFEST_FILE,
                entrypoint=SDK_CONTAINER_ENTRYPOINT))
      self._generate_artifacts_manifests_json_file(
          file_names, self._temp_src_dir)

  def _invoke_docker_build_and_push(self, container_image_name):
    raise NotImplementedError

  @classmethod
  def _builder_key(cls) -> str:
    return f'{cls.__module__}.{cls.__name__}'

  @staticmethod
  def _generate_artifacts_manifests_json_file(file_names, temp_dir):
    infos = []
    for name in file_names:
      info = beam_runner_api_pb2.ArtifactInformation(
          type_urn=common_urns.StandardArtifacts.Types.FILE.urn,
          type_payload=beam_runner_api_pb2.ArtifactFilePayload(
              path=name).SerializeToString(),
      )
      infos.append(json.dumps(MessageToJson(info)))
    with open(os.path.join(temp_dir, ARTIFACTS_MANIFEST_FILE), 'w') as file:
      file.write('[\n' + ',\n'.join(infos) + '\n]')
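
  # For illustration: because the MessageToJson() output is passed through
  # json.dumps(), the manifest is a JSON array whose elements are JSON-encoded
  # *strings*, each containing a serialized ArtifactInformation message, e.g.
  # (file name hypothetical):
  #
  #   [
  #   "{\n  \"typeUrn\": \"beam:artifact:type:file:v1\", ... }"
  #   ]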

  @classmethod
  def build_container_image(cls, pipeline_options: PipelineOptions) -> str:
    setup_options = pipeline_options.view_as(SetupOptions)
    container_build_engine = setup_options.prebuild_sdk_container_engine
    builder_cls = cls._get_subclass_by_key(container_build_engine)
    builder = builder_cls(pipeline_options)
    return builder._build()
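
  # A minimal usage sketch (flag values are illustrative):
  #
  #   options = PipelineOptions([
  #       '--prebuild_sdk_container_engine=local_docker',
  #       '--docker_registry_push_url=gcr.io/my-project',  # hypothetical
  #   ])
  #   image = SdkContainerImageBuilder.build_container_image(options)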

  @classmethod
  def _get_subclass_by_key(cls, key: str) -> Type['SdkContainerImageBuilder']:
    available_builders = [
        subclass for subclass in cls.get_all_subclasses()
        if subclass._builder_key() == key
    ]
    if not available_builders:
      available_builder_keys = [
          subclass._builder_key() for subclass in cls.get_all_subclasses()
      ]
      raise ValueError(
          f'Cannot find SDK builder type {key} in '
          f'{available_builder_keys}')
    elif len(available_builders) > 1:
      raise ValueError(f'Found multiple builders under key {key}')
    return available_builders[0]
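
# Builder discovery goes through BeamPlugin.get_all_subclasses(), so any
# imported subclass of SdkContainerImageBuilder (including builders defined
# outside this module) can be selected by passing its _builder_key() as the
# prebuild engine name.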


class _SdkContainerImageLocalBuilder(SdkContainerImageBuilder):
  """Builds the SDK container image with local Docker."""
  @classmethod
  def _builder_key(cls):
    return 'local_docker'

  def _invoke_docker_build_and_push(self, container_image_name):
    try:
      _LOGGER.info("Building sdk container, this may take a few minutes...")
      now = time.time()
      subprocess.run(['docker', 'build', '.', '-t', container_image_name],
                     check=True,
                     cwd=self._temp_src_dir)
    except subprocess.CalledProcessError as err:
      # stderr is not captured by subprocess.run above (docker output streams
      # to the console), so report the failing command and exit code instead.
      raise RuntimeError(
          'Failed to build sdk container with local docker: %s' % err)
    else:
      _LOGGER.info(
          "Successfully built %s in %.2f seconds" %
          (container_image_name, time.time() - now))

    if self._docker_registry_push_url:
      _LOGGER.info("Pushing prebuilt sdk container...")
      try:
        subprocess.run(['docker', 'push', container_image_name], check=True)
      except subprocess.CalledProcessError as err:
        raise RuntimeError(
            'Failed to push prebuilt sdk container %s: %s' %
            (container_image_name, err))
      _LOGGER.info(
          "Successfully pushed %s in %.2f seconds" %
          (container_image_name, time.time() - now))
    else:
      _LOGGER.info(
          "No --docker_registry_push_url option was specified in pipeline "
          "options; specify it if the new image is intended to be "
          "pushed to a registry.")
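
# For reference, the local build above is roughly equivalent to running, from
# the temp source directory (image name hypothetical):
#
#   docker build . -t gcr.io/my-project/beam_python_prebuilt_sdk:<uuid>
#   docker push gcr.io/my-project/beam_python_prebuilt_sdk:<uuid>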


class _SdkContainerImageCloudBuilder(SdkContainerImageBuilder):
  """Builds the SDK container image with Google Cloud Build."""
  def __init__(self, options):
    super().__init__(options)
    self._google_cloud_options = options.view_as(GoogleCloudOptions)
    self._cloud_build_machine_type = self._get_cloud_build_machine_type_enum(
        options.view_as(SetupOptions).cloud_build_machine_type)
    if self._google_cloud_options.no_auth:
      credentials = None
    else:
      credentials = get_service_credentials(options)
    self._storage_client = storage.StorageV1(
        url='https://www.googleapis.com/storage/v1',
        credentials=credentials,
        get_credentials=(not self._google_cloud_options.no_auth),
        http=get_new_http(),
        response_encoding='utf8')
    self._cloudbuild_client = cloudbuild.CloudbuildV1(
        credentials=credentials,
        get_credentials=(not self._google_cloud_options.no_auth),
        http=get_new_http(),
        response_encoding='utf8')
    if not self._docker_registry_push_url:
      self._docker_registry_push_url = (
          'gcr.io/%s/prebuilt_beam_sdk' % self._google_cloud_options.project)

  @classmethod
  def _builder_key(cls):
    return 'cloud_build'

  def _invoke_docker_build_and_push(self, container_image_name):
    project_id = self._google_cloud_options.project
    temp_location = self._google_cloud_options.temp_location
    # The Google Cloud Build service expects all of the build source files to
    # be compressed into a tarball.
    tarball_path = os.path.join(self._temp_src_dir, '%s.tgz' % SOURCE_FOLDER)
    self._make_tarfile(tarball_path, self._temp_src_dir)
    _LOGGER.info(
        "Compressed source files for building sdk container at %s" %
        tarball_path)

    container_image_tag = container_image_name.split(':')[-1]
    gcs_location = os.path.join(
        temp_location, '%s-%s.tgz' % (SOURCE_FOLDER, container_image_tag))
    self._upload_to_gcs(tarball_path, gcs_location)

    build = cloudbuild.Build()
    if self._cloud_build_machine_type:
      build.options = cloudbuild.BuildOptions()
      build.options.machineType = self._cloud_build_machine_type
    build.steps = []
    step = cloudbuild.BuildStep()
    step.name = 'gcr.io/kaniko-project/executor:latest'
    step.args = ['--destination=' + container_image_name, '--cache=true']
    step.dir = SOURCE_FOLDER

    build.steps.append(step)

    source = cloudbuild.Source()
    source.storageSource = cloudbuild.StorageSource()
    gcs_bucket, gcs_object = self._get_gcs_bucket_and_name(gcs_location)
    source.storageSource.bucket = gcs_bucket
    source.storageSource.object = gcs_object
    build.source = source
    # TODO(zyichi): make timeout configurable
    build.timeout = '7200s'

    now = time.time()
    request = cloudbuild.CloudbuildProjectsBuildsCreateRequest(
        projectId=project_id, build=build)
    build = self._cloudbuild_client.projects_builds.Create(request)
    build_id, log_url = self._get_cloud_build_id_and_log_url(build.metadata)
    _LOGGER.info(
        'Building sdk container with Google Cloud Build, this may '
        'take a few minutes; you may check the build log at %s' % log_url)

    # Block until the build finishes; if the build fails, an exception is
    # raised and job submission stops.
    response = self._cloudbuild_client.projects_builds.Get(
        cloudbuild.CloudbuildProjectsBuildsGetRequest(
            id=build_id, projectId=project_id))
    while response.status in [cloudbuild.Build.StatusValueValuesEnum.QUEUED,
                              cloudbuild.Build.StatusValueValuesEnum.PENDING,
                              cloudbuild.Build.StatusValueValuesEnum.WORKING]:
      time.sleep(10)
      response = self._cloudbuild_client.projects_builds.Get(
          cloudbuild.CloudbuildProjectsBuildsGetRequest(
              id=build_id, projectId=project_id))

    if response.status != cloudbuild.Build.StatusValueValuesEnum.SUCCESS:
      raise RuntimeError(
          'Failed to build python sdk container image on google cloud build, '
          'please check the build log for errors.')

    _LOGGER.info(
        "Python SDK container pre-build finished in %.2f seconds" %
        (time.time() - now))
    _LOGGER.info(
        "Python SDK container built and pushed as %s." % container_image_name)

  def _upload_to_gcs(self, local_file_path, gcs_location):
    gcs_bucket, gcs_object = self._get_gcs_bucket_and_name(gcs_location)
    request = storage.StorageObjectsInsertRequest(
        bucket=gcs_bucket, name=gcs_object)
    _LOGGER.info('Starting GCS upload to %s...', gcs_location)
    total_size = os.path.getsize(local_file_path)
    from apitools.base.py import exceptions
    try:
      with open(local_file_path, 'rb') as stream:
        upload = storage.Upload(stream, 'application/octet-stream', total_size)
        self._storage_client.objects.Insert(request, upload=upload)
    except exceptions.HttpError as e:
      reportable_errors = {
          403: 'access denied',
          404: 'bucket not found',
      }
      if e.status_code in reportable_errors:
        raise IOError((
            'Could not upload to GCS path %s: %s. Please verify '
            'that credentials are valid and that you have write '
            'access to the specified path.') %
                      (gcs_location, reportable_errors[e.status_code]))
      raise
    _LOGGER.info('Completed GCS upload to %s.', gcs_location)

  def _get_cloud_build_id_and_log_url(self, metadata):
    build_id = None
    log_url = None
    for item in metadata.additionalProperties:
      if item.key == 'build':
        for field in item.value.object_value.properties:
          if field.key == 'logUrl':
            log_url = field.value.string_value
          if field.key == 'id':
            build_id = field.value.string_value
    return build_id, log_url

  @staticmethod
  def _get_gcs_bucket_and_name(gcs_location):
    return gcs_location[5:].split('/', 1)
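  # For illustration (hypothetical path): 'gs://my-bucket/tmp/source-1234.tgz'
  # -> ('my-bucket', 'tmp/source-1234.tgz'); the [5:] slice strips 'gs://'.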

  @staticmethod
  def _make_tarfile(output_filename, source_dir):
    with tarfile.open(output_filename, "w:gz") as tar:
      tar.add(source_dir, arcname=SOURCE_FOLDER)

  @staticmethod
  def _get_cloud_build_machine_type_enum(machine_type: str):
    if not machine_type:
      return None
    mappings = {
        'n1-highcpu-8': cloudbuild.BuildOptions.MachineTypeValueValuesEnum
        .N1_HIGHCPU_8,
        'n1-highcpu-32': cloudbuild.BuildOptions.MachineTypeValueValuesEnum
        .N1_HIGHCPU_32,
        'e2-highcpu-8': cloudbuild.BuildOptions.MachineTypeValueValuesEnum
        .E2_HIGHCPU_8,
        'e2-highcpu-32': cloudbuild.BuildOptions.MachineTypeValueValuesEnum
        .E2_HIGHCPU_32,
    }
    if machine_type.lower() in mappings:
      return mappings[machine_type.lower()]
    else:
      raise ValueError(
          'Unknown Cloud Build machine type %r; please specify one of '
          '[n1-highcpu-8, n1-highcpu-32, e2-highcpu-8, e2-highcpu-32].' %
          machine_type)