github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/dataflow/internal/apiclient.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""For internal use only. No backwards compatibility guarantees.

Dataflow client utility functions."""

# pytype: skip-file
# To regenerate the client:
# pip install google-apitools[cli]
# gen_client --discovery_url=cloudbuild.v1 --overwrite \
#   --outdir=apache_beam/runners/dataflow/internal/clients/cloudbuild \
#   --root_package=. client

import ast
import codecs
from functools import partial
import getpass
import hashlib
import io
import json
import logging
import os
import random
import string

import pkg_resources
import re
import sys
import time
import warnings
from copy import copy
from datetime import datetime

from apitools.base.py import encoding
from apitools.base.py import exceptions

from apache_beam import version as beam_version
from apache_beam.internal.gcp.auth import get_service_credentials
from apache_beam.internal.gcp.json_value import to_json_value
from apache_beam.internal.http_client import get_new_http
from apache_beam.io.filesystems import FileSystems
from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
from apache_beam.io.gcp.internal.clients import storage
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners.common import validate_pipeline_graph
from apache_beam.runners.dataflow.internal import names
from apache_beam.runners.dataflow.internal.clients import dataflow
from apache_beam.runners.dataflow.internal.names import PropertyNames
from apache_beam.runners.internal import names as shared_names
from apache_beam.runners.portability.stager import Stager
from apache_beam.transforms import DataflowDistributionCounter
from apache_beam.transforms import cy_combiners
from apache_beam.transforms.display import DisplayData
from apache_beam.transforms.environments import is_apache_beam_container
from apache_beam.utils import retry
from apache_beam.utils import proto_utils

# Environment version information. It is passed to the service during a
# job submission and is used by the service to establish what features
# are expected by the workers.
_LEGACY_ENVIRONMENT_MAJOR_VERSION = '8'
_FNAPI_ENVIRONMENT_MAJOR_VERSION = '8'

_LOGGER = logging.getLogger(__name__)

_PYTHON_VERSIONS_SUPPORTED_BY_DATAFLOW = ['3.7', '3.8', '3.9', '3.10', '3.11']


class Step(object):
  """Wrapper for a dataflow Step protobuf."""
  def __init__(self, step_kind, step_name, additional_properties=None):
    self.step_kind = step_kind
    self.step_name = step_name
    self.proto = dataflow.Step(kind=step_kind, name=step_name)
    self.proto.properties = {}
    self._additional_properties = []

    if additional_properties is not None:
      for (n, v, t) in additional_properties:
        self.add_property(n, v, t)

  def add_property(self, name, value, with_type=False):
    self._additional_properties.append((name, value, with_type))
    self.proto.properties.additionalProperties.append(
        dataflow.Step.PropertiesValue.AdditionalProperty(
            key=name, value=to_json_value(value, with_type=with_type)))

  def _get_outputs(self):
    """Returns a list of all output labels for a step."""
    outputs = []
    for p in self.proto.properties.additionalProperties:
      if p.key == PropertyNames.OUTPUT_INFO:
        for entry in p.value.array_value.entries:
          for entry_prop in entry.object_value.properties:
            if entry_prop.key == PropertyNames.OUTPUT_NAME:
              outputs.append(entry_prop.value.string_value)
    return outputs

  def __reduce__(self):
    """Reduce hook for pickling the Step class more easily."""
    return (Step, (self.step_kind, self.step_name, self._additional_properties))

  def get_output(self, tag=None):
    """Returns name if it is one of the outputs or first output if name is None.

    Args:
      tag: tag of the output as a string or None if we want to get the
        name of the first output.

    Returns:
      The name of the output associated with the tag or the first output
      if tag was None.

    Raises:
      ValueError: if the tag does not exist within outputs.
    """
    outputs = self._get_outputs()
    if tag is None or len(outputs) == 1:
      return outputs[0]
    else:
      if tag not in outputs:
        raise ValueError('Cannot find named output: %s in %s.' % (tag, outputs))
      return tag
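
# Illustrative sketch (not part of the original file): how Step is typically
# assembled when translating a pipeline into a Dataflow job graph. The step
# kind, name, and property values below are hypothetical placeholders.
#
#   step = Step('ParallelDo', 's2')
#   step.add_property('user_name', 'MyTransform/ParDo(MyDoFn)')
#   # Once an output_info property has been added, step.get_output() returns
#   # the first declared output label, and step.get_output('out') validates
#   # that the tag exists among the declared outputs.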


class Environment(object):
  """Wrapper for a dataflow Environment protobuf."""
  def __init__(
      self,
      packages,
      options,
      environment_version,
      proto_pipeline_staged_url,
      proto_pipeline=None):
    from apache_beam.runners.dataflow.dataflow_runner import _is_runner_v2
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.debug_options = options.view_as(DebugOptions)
    self.pipeline_url = proto_pipeline_staged_url
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
    self.proto.dataset = '{}/cloud_dataflow'.format(
        GoogleCloudOptions.BIGQUERY_API_SERVICE)
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace(
            'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
    if self.worker_options.worker_region:
      self.proto.workerRegion = self.worker_options.worker_region
    if self.worker_options.worker_zone:
      self.proto.workerZone = self.worker_options.worker_zone
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint
    self._proto_pipeline = proto_pipeline

    if self.google_cloud_options.service_account_email:
      self.proto.serviceAccountEmail = (
          self.google_cloud_options.service_account_email)
    if self.google_cloud_options.dataflow_kms_key:
      self.proto.serviceKmsKeyName = self.google_cloud_options.dataflow_kms_key

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name', value=to_json_value(self._get_python_sdk_name())),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(beam_version.__version__))
    ])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    _verify_interpreter_version_is_supported(options)
    if self.standard_options.streaming:
      job_type = 'FNAPI_STREAMING'
    else:
      if _is_runner_v2(options):
        job_type = 'FNAPI_BATCH'
      else:
        job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type', value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))
    ])
    # TODO: Use enumerated type instead of strings for job types.
    if job_type.startswith('FNAPI_'):
      self.debug_options.experiments = self.debug_options.experiments or []

    debug_options_experiments = self.debug_options.experiments
    # Add use_multiple_sdk_containers flag if it's not already present. Do not
    # add the flag if 'no_use_multiple_sdk_containers' is present.
    # TODO: Cleanup use_multiple_sdk_containers once we deprecate Python SDK
    # till version 2.4.
    if ('use_multiple_sdk_containers' not in debug_options_experiments and
        'no_use_multiple_sdk_containers' not in debug_options_experiments):
      debug_options_experiments.append('use_multiple_sdk_containers')
    # FlexRS
    if self.google_cloud_options.flexrs_goal == 'COST_OPTIMIZED':
      self.proto.flexResourceSchedulingGoal = (
          dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
          FLEXRS_COST_OPTIMIZED)
    elif self.google_cloud_options.flexrs_goal == 'SPEED_OPTIMIZED':
      self.proto.flexResourceSchedulingGoal = (
          dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
          FLEXRS_SPEED_OPTIMIZED)
    # Experiments
    if self.debug_options.experiments:
      for experiment in self.debug_options.experiments:
        self.proto.experiments.append(experiment)
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                servicePath=self.google_cloud_options.dataflow_endpoint)))

    pool.autoscalingSettings = dataflow.AutoscalingSettings()
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.max_num_workers:
      pool.autoscalingSettings.maxNumWorkers = (
          self.worker_options.max_num_workers)
    if self.worker_options.autoscaling_algorithm:
      values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
      pool.autoscalingSettings.algorithm = {
          'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
          'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
      }.get(self.worker_options.autoscaling_algorithm)
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.subnetwork:
      pool.subnetwork = self.worker_options.subnetwork

    # Setting worker pool sdk_harness_container_images option for supported
    # Dataflow workers.
    environments_to_use = self._get_environments_from_tranforms()

    # Adding container images for other SDKs that may be needed for
    # cross-language pipelines.
    for id, environment in environments_to_use:
      if environment.urn != common_urns.environments.DOCKER.urn:
        raise Exception(
            'Dataflow can only execute pipeline steps in Docker environments.'
            ' Received %r.' % environment)
      environment_payload = proto_utils.parse_Bytes(
          environment.payload, beam_runner_api_pb2.DockerPayload)
      container_image_url = environment_payload.container_image

      container_image = dataflow.SdkHarnessContainerImage()
      container_image.containerImage = container_image_url
      container_image.useSingleCorePerContainer = (
          common_urns.protocols.MULTI_CORE_BUNDLE_PROCESSING.urn not in
          environment.capabilities)
      container_image.environmentId = id
      for capability in environment.capabilities:
        container_image.capabilities.append(capability)
      pool.sdkHarnessContainerImages.append(container_image)

    if not _is_runner_v2(options) or not pool.sdkHarnessContainerImages:
      pool.workerHarnessContainerImage = (
          get_container_image_from_options(options))
    elif len(pool.sdkHarnessContainerImages) == 1:
      # Dataflow expects a value here when there is only one environment.
      pool.workerHarnessContainerImage = (
          pool.sdkHarnessContainerImages[0].containerImage)

    if self.debug_options.number_of_worker_harness_threads:
      pool.numThreadsPerWorker = (
          self.debug_options.number_of_worker_harness_threads)
    if self.worker_options.use_public_ips is not None:
      if self.worker_options.use_public_ips:
        pool.ipConfiguration = (
            dataflow.WorkerPool.IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
      else:
        pool.ipConfiguration = (
            dataflow.WorkerPool.IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE
        )

    if self.standard_options.streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      if self.worker_options.disk_type:
        disk.diskType = self.worker_options.disk_type
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)

    sdk_pipeline_options = options.get_all_options(retain_unknown_options=True)
    if sdk_pipeline_options:
      self.proto.sdkPipelineOptions = (
          dataflow.Environment.SdkPipelineOptionsValue())

      options_dict = {
          k: v
          for k, v in sdk_pipeline_options.items() if v is not None
      }
      options_dict["pipelineUrl"] = proto_pipeline_staged_url
      # Don't pass impersonate_service_account through to the harness.
      # Though impersonation should start a job, the workers should
      # not try to modify their credentials.
      options_dict.pop('impersonate_service_account', None)
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='options', value=to_json_value(options_dict)))

      dd = DisplayData.create_from_options(options)
      items = [item.get_dict() for item in dd.items]
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='display_data', value=to_json_value(items)))

    if self.google_cloud_options.dataflow_service_options:
      for option in self.google_cloud_options.dataflow_service_options:
        self.proto.serviceOptions.append(option)

    if self.google_cloud_options.enable_hot_key_logging:
      self.proto.debugOptions = dataflow.DebugOptions(enableHotKeyLogging=True)

  def _get_environments_from_tranforms(self):
    if not self._proto_pipeline:
      return []

    environment_ids = set(
        transform.environment_id
        for transform in self._proto_pipeline.components.transforms.values()
        if transform.environment_id)

    return [(id, self._proto_pipeline.components.environments[id])
            for id in environment_ids]

  def _get_python_sdk_name(self):
    python_version = '%d.%d' % (sys.version_info[0], sys.version_info[1])
    return 'Apache Beam Python %s SDK' % python_version


class Job(object):
  """Wrapper for a dataflow Job protobuf."""
  def __str__(self):
    def encode_shortstrings(input_buffer, errors='strict'):
      """Encoder (from Unicode) that suppresses long base64 strings."""
      original_len = len(input_buffer)
      if original_len > 150:
        if self.base64_str_re.match(input_buffer):
          input_buffer = '<string of %d bytes>' % original_len
          input_buffer = input_buffer.encode('ascii', errors=errors)
        else:
          matched = self.coder_str_re.match(input_buffer)
          if matched:
            input_buffer = '%s<string of %d bytes>' % (
                matched.group(1), matched.end(2) - matched.start(2))
            input_buffer = input_buffer.encode('ascii', errors=errors)
      return input_buffer, original_len

    def decode_shortstrings(input_buffer, errors='strict'):
      """Decoder (to Unicode) that suppresses long base64 strings."""
      shortened, length = encode_shortstrings(input_buffer, errors)
      return str(shortened), length

    def shortstrings_registerer(encoding_name):
      if encoding_name == 'shortstrings':
        return codecs.CodecInfo(
            name='shortstrings',
            encode=encode_shortstrings,
            decode=decode_shortstrings)
      return None

    codecs.register(shortstrings_registerer)

    # Use json "dump string" method to get readable formatting;
    # further modify it to not output too-long strings, aimed at the
    # 10,000+ character hex-encoded "serialized_fn" values.
    return json.dumps(
        json.loads(encoding.MessageToJson(self.proto)),
        indent=2,
        sort_keys=True)

  @staticmethod
  def _build_default_job_name(user_name):
    """Generates a default name for a job.

    user_name is lowercased, and any characters outside of [-a-z0-9]
    are removed. If necessary, the user_name is truncated to shorten
    the job name to 63 characters."""
    user_name = re.sub('[^-a-z0-9]', '', user_name.lower())
    date_component = datetime.utcnow().strftime('%m%d%H%M%S-%f')
    app_user_name = 'beamapp-{}'.format(user_name)
    # append 8 random alphanumeric characters to avoid collisions.
    random_component = ''.join(
        random.choices(string.ascii_lowercase + string.digits, k=8))
    job_name = '{}-{}-{}'.format(
        app_user_name, date_component, random_component)
    if len(job_name) > 63:
      job_name = '{}-{}-{}'.format(
          app_user_name[:-(len(job_name) - 63)],
          date_component,
          random_component)
    return job_name

  @staticmethod
  def default_job_name(job_name):
    if job_name is None:
      job_name = Job._build_default_job_name(getpass.getuser())
    return job_name

  def __init__(self, options, proto_pipeline):
    self.options = options
    validate_pipeline_graph(proto_pipeline)
    self.proto_pipeline = proto_pipeline
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    if not self.google_cloud_options.job_name:
      self.google_cloud_options.job_name = self.default_job_name(
          self.google_cloud_options.job_name)

    required_google_cloud_options = ['project', 'job_name', 'temp_location']
    missing = [
        option for option in required_google_cloud_options
        if not getattr(self.google_cloud_options, option)
    ]
    if missing:
      raise ValueError(
          'Missing required configuration parameters: %s' % missing)

    if not self.google_cloud_options.staging_location:
      _LOGGER.info(
          'Defaulting to the temp_location as staging_location: %s',
          self.google_cloud_options.temp_location)
      (
          self.google_cloud_options.staging_location
      ) = self.google_cloud_options.temp_location

    self.root_staging_location = self.google_cloud_options.staging_location

    # Make the staging and temp locations job name and time specific. This is
    # needed to avoid clashes between job submissions using the same staging
    # area or team members using same job names. This method is not entirely
    # foolproof since two job submissions with same name can happen at exactly
    # the same time. However the window is extremely small given that
    # time.time() has at least microseconds granularity. We add the suffix only
    # for GCS staging locations where the potential for such clashes is high.
    if self.google_cloud_options.staging_location.startswith('gs://'):
      path_suffix = '%s.%f' % (self.google_cloud_options.job_name, time.time())
      self.google_cloud_options.staging_location = FileSystems.join(
          self.google_cloud_options.staging_location, path_suffix)
      self.google_cloud_options.temp_location = FileSystems.join(
          self.google_cloud_options.temp_location, path_suffix)

    self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
    if self.options.view_as(StandardOptions).streaming:
      self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING
    else:
      self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH
    if self.google_cloud_options.update:
      self.proto.replaceJobId = self.job_id_for_name(self.proto.name)
      if self.google_cloud_options.transform_name_mapping:
        self.proto.transformNameMapping = (
            dataflow.Job.TransformNameMappingValue())
        for _, (key, value) in enumerate(
            self.google_cloud_options.transform_name_mapping.items()):
          self.proto.transformNameMapping.additionalProperties.append(
              dataflow.Job.TransformNameMappingValue.AdditionalProperty(
                  key=key, value=value))
    if self.google_cloud_options.create_from_snapshot:
      self.proto.createdFromSnapshotId = (
          self.google_cloud_options.create_from_snapshot)
    # Labels.
    if self.google_cloud_options.labels:
      self.proto.labels = dataflow.Job.LabelsValue()
      labels = self.google_cloud_options.labels
      for label in labels:
        if '{' in label:
          label = ast.literal_eval(label)
          for key, value in label.items():
            self.proto.labels.additionalProperties.append(
                dataflow.Job.LabelsValue.AdditionalProperty(
                    key=key, value=value))
        else:
          parts = label.split('=', 1)
          key = parts[0]
          value = parts[1] if len(parts) > 1 else ''
          self.proto.labels.additionalProperties.append(
              dataflow.Job.LabelsValue.AdditionalProperty(key=key, value=value))

    # Client Request ID
    self.proto.clientRequestId = '{}-{}'.format(
        datetime.utcnow().strftime('%Y%m%d%H%M%S%f'),
        random.randrange(9000) + 1000)

    self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
    self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')

  def job_id_for_name(self, job_name):
    return DataflowApplicationClient(
        self.google_cloud_options).job_id_for_name(job_name)

  def json(self):
    return encoding.MessageToJson(self.proto)

  def __reduce__(self):
    """Reduce hook for pickling the Job class more easily."""
    return (Job, (self.options, ))
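
# Illustrative note (not part of the original file): the shape of the default
# job name produced by Job._build_default_job_name(), using hypothetical
# values for the user name and the random suffix.
#
#   Job.default_job_name(None)
#   # -> e.g. 'beamapp-alice-0612143022-123456-a1b2c3d4'
#   #    i.e. 'beamapp-<sanitized user>-<MMDDHHMMSS-microseconds>-<8 random
#   #    lowercase alphanumerics>', truncated so the whole name stays within
#   #    63 characters.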


class DataflowApplicationClient(object):
  _HASH_CHUNK_SIZE = 1024 * 8
  _GCS_CACHE_PREFIX = "artifact_cache"
  """A Dataflow API client used by application code to create and query jobs."""
  def __init__(self, options, root_staging_location=None):
    """Initializes a Dataflow API client object."""
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self._enable_caching = self.google_cloud_options.enable_artifact_caching
    self._root_staging_location = (
        root_staging_location or self.google_cloud_options.staging_location)

    from apache_beam.runners.dataflow.dataflow_runner import _is_runner_v2
    if _is_runner_v2(options):
      self.environment_version = _FNAPI_ENVIRONMENT_MAJOR_VERSION
    else:
      self.environment_version = _LEGACY_ENVIRONMENT_MAJOR_VERSION

    if self.google_cloud_options.no_auth:
      credentials = None
    else:
      credentials = get_service_credentials(options)

    http_client = get_new_http()
    self._client = dataflow.DataflowV1b3(
        url=self.google_cloud_options.dataflow_endpoint,
        credentials=credentials,
        get_credentials=(not self.google_cloud_options.no_auth),
        http=http_client,
        response_encoding=get_response_encoding())
    self._storage_client = storage.StorageV1(
        url='https://www.googleapis.com/storage/v1',
        credentials=credentials,
        get_credentials=(not self.google_cloud_options.no_auth),
        http=http_client,
        response_encoding=get_response_encoding())
    self._sdk_image_overrides = self._get_sdk_image_overrides(options)

  def _get_sdk_image_overrides(self, pipeline_options):
    worker_options = pipeline_options.view_as(WorkerOptions)
    sdk_overrides = worker_options.sdk_harness_container_image_overrides
    return (
        dict(s.split(',', 1) for s in sdk_overrides) if sdk_overrides else {})

  @staticmethod
  def _compute_sha256(file):
    hasher = hashlib.sha256()
    with open(file, 'rb') as f:
      for chunk in iter(partial(f.read,
                                DataflowApplicationClient._HASH_CHUNK_SIZE),
                        b""):
        hasher.update(chunk)
    return hasher.hexdigest()

  def _cached_location(self, sha256):
    sha_prefix = sha256[0:2]
    return FileSystems.join(
        self._root_staging_location,
        DataflowApplicationClient._GCS_CACHE_PREFIX,
        sha_prefix,
        sha256)

  def _gcs_file_copy(self, from_path, to_path, sha256):
    if self._enable_caching and sha256:
      self._cached_gcs_file_copy(from_path, to_path, sha256)
    else:
      self._uncached_gcs_file_copy(from_path, to_path)

  def _cached_gcs_file_copy(self, from_path, to_path, sha256):
    cached_path = self._cached_location(sha256)
    if FileSystems.exists(cached_path):
      _LOGGER.info(
          'Skipping upload of %s because it already exists at %s',
          to_path,
          cached_path)
    else:
      self._uncached_gcs_file_copy(from_path, cached_path)

    FileSystems.copy(
        source_file_names=[cached_path], destination_file_names=[to_path])
    _LOGGER.info('Copied cached artifact from %s to %s', from_path, to_path)

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  def _uncached_gcs_file_copy(self, from_path, to_path):
    to_folder, to_name = os.path.split(to_path)
    total_size = os.path.getsize(from_path)
    with open(from_path, 'rb') as f:
      self.stage_file(to_folder, to_name, f, total_size=total_size)

  def _stage_resources(self, pipeline, options):
    google_cloud_options = options.view_as(GoogleCloudOptions)
    if google_cloud_options.staging_location is None:
      raise RuntimeError('The --staging_location option must be specified.')
    if google_cloud_options.temp_location is None:
      raise RuntimeError('The --temp_location option must be specified.')

    resources = []
    staged_paths = {}
    staged_hashes = {}
    for _, env in sorted(pipeline.components.environments.items(),
                         key=lambda kv: kv[0]):
      for dep in env.dependencies:
        if dep.type_urn != common_urns.artifact_types.FILE.urn:
          raise RuntimeError('unsupported artifact type %s' % dep.type_urn)
        type_payload = beam_runner_api_pb2.ArtifactFilePayload.FromString(
            dep.type_payload)

        if dep.role_urn == common_urns.artifact_roles.STAGING_TO.urn:
          remote_name = (
              beam_runner_api_pb2.ArtifactStagingToRolePayload.FromString(
                  dep.role_payload)).staged_name
          is_staged_role = True
        else:
          remote_name = os.path.basename(type_payload.path)
          is_staged_role = False

        if self._enable_caching and not type_payload.sha256:
          type_payload.sha256 = self._compute_sha256(type_payload.path)

        if type_payload.sha256 and type_payload.sha256 in staged_hashes:
          _LOGGER.info(
              'Found duplicated artifact sha256: %s (%s)',
              type_payload.path,
              type_payload.sha256)
          remote_name = staged_hashes[type_payload.sha256]
          if is_staged_role:
            # We should not be overriding this, as dep.role_payload.staged_name
            # refers to the desired name on the worker, whereas staged_name
            # refers to its placement in a distributed filesystem.
            # TODO(heejong): Clean this up.
            dep.role_payload = beam_runner_api_pb2.ArtifactStagingToRolePayload(
                staged_name=remote_name).SerializeToString()
        elif type_payload.path and type_payload.path in staged_paths:
          _LOGGER.info(
              'Found duplicated artifact path: %s (%s)',
              type_payload.path,
              type_payload.sha256)
          remote_name = staged_paths[type_payload.path]
          if is_staged_role:
            # We should not be overriding this, as dep.role_payload.staged_name
            # refers to the desired name on the worker, whereas staged_name
            # refers to its placement in a distributed filesystem.
            # TODO(heejong): Clean this up.
            dep.role_payload = beam_runner_api_pb2.ArtifactStagingToRolePayload(
                staged_name=remote_name).SerializeToString()
        else:
          resources.append(
              (type_payload.path, remote_name, type_payload.sha256))
          staged_paths[type_payload.path] = remote_name
          staged_hashes[type_payload.sha256] = remote_name

        if FileSystems.get_scheme(
            google_cloud_options.staging_location) == GCSFileSystem.scheme():
          dep.type_urn = common_urns.artifact_types.URL.urn
          dep.type_payload = beam_runner_api_pb2.ArtifactUrlPayload(
              url=FileSystems.join(
                  google_cloud_options.staging_location, remote_name),
              sha256=type_payload.sha256).SerializeToString()
        else:
          dep.type_payload = beam_runner_api_pb2.ArtifactFilePayload(
              path=FileSystems.join(
                  google_cloud_options.staging_location, remote_name),
              sha256=type_payload.sha256).SerializeToString()

    resource_stager = _LegacyDataflowStager(self)
    staged_resources = resource_stager.stage_job_resources(
        resources, staging_location=google_cloud_options.staging_location)
    return staged_resources

  def stage_file(
      self,
      gcs_or_local_path,
      file_name,
      stream,
      mime_type='application/octet-stream',
      total_size=None):
    """Stages a file at a GCS or local path with stream-supplied contents."""
    if not gcs_or_local_path.startswith('gs://'):
      local_path = FileSystems.join(gcs_or_local_path, file_name)
      _LOGGER.info('Staging file locally to %s', local_path)
      with open(local_path, 'wb') as f:
        f.write(stream.read())
      return
    gcs_location = FileSystems.join(gcs_or_local_path, file_name)
    bucket, name = gcs_location[5:].split('/', 1)

    request = storage.StorageObjectsInsertRequest(bucket=bucket, name=name)
    start_time = time.time()
    _LOGGER.info('Starting GCS upload to %s...', gcs_location)
    upload = storage.Upload(stream, mime_type, total_size)
    try:
      response = self._storage_client.objects.Insert(request, upload=upload)
    except exceptions.HttpError as e:
      reportable_errors = {
          403: 'access denied',
          404: 'bucket not found',
      }
      if e.status_code in reportable_errors:
        raise IOError((
            'Could not upload to GCS path %s: %s. Please verify '
            'that credentials are valid and that you have write '
            'access to the specified path.') %
                      (gcs_or_local_path, reportable_errors[e.status_code]))
      raise
    _LOGGER.info(
        'Completed GCS upload to %s in %s seconds.',
        gcs_location,
        int(time.time() - start_time))
    return response

  @retry.no_retries  # Using no_retries marks this as an integration point.
  def create_job(self, job):
    """Creates job description. May stage and/or submit for remote execution."""
    self.create_job_description(job)

    # Stage and submit the job when necessary
    dataflow_job_file = job.options.view_as(DebugOptions).dataflow_job_file
    template_location = (
        job.options.view_as(GoogleCloudOptions).template_location)

    if job.options.view_as(DebugOptions).lookup_experiment('upload_graph'):
      self.stage_file(
          job.options.view_as(GoogleCloudOptions).staging_location,
          "dataflow_graph.json",
          io.BytesIO(job.json().encode('utf-8')))
      del job.proto.steps[:]
      job.proto.stepsLocation = FileSystems.join(
          job.options.view_as(GoogleCloudOptions).staging_location,
          "dataflow_graph.json")

    # template file generation should be placed immediately before the
    # conditional API call.
    job_location = template_location or dataflow_job_file
    if job_location:
      gcs_or_local_path = os.path.dirname(job_location)
      file_name = os.path.basename(job_location)
      self.stage_file(
          gcs_or_local_path, file_name, io.BytesIO(job.json().encode('utf-8')))

    if not template_location:
      return self.submit_job_description(job)

    _LOGGER.info(
        'A template was just created at location %s', template_location)
    return None

  @staticmethod
  def _update_container_image_for_dataflow(beam_container_image_url):
    # By default Dataflow pipelines use containers hosted in Dataflow GCR
    # instead of Docker Hub.
    image_suffix = beam_container_image_url.rsplit('/', 1)[1]
    return names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY + '/' + image_suffix

  @staticmethod
  def _apply_sdk_environment_overrides(
      proto_pipeline, sdk_overrides, pipeline_options):
    # Updates container image URLs for Dataflow.
    # For a given container image URL:
    # * If a matching override has been provided, that will be used.
    # * For improved performance, external Apache Beam container images that
    #   are not explicitly overridden will be updated to use GCR copies
    #   instead of downloading directly from Docker Hub.

    current_sdk_container_image = get_container_image_from_options(
        pipeline_options)

    for environment in proto_pipeline.components.environments.values():
      docker_payload = proto_utils.parse_Bytes(
          environment.payload, beam_runner_api_pb2.DockerPayload)
      overridden = False
      new_container_image = docker_payload.container_image
      for pattern, override in sdk_overrides.items():
        new_container_image = re.sub(pattern, override, new_container_image)
        if new_container_image != docker_payload.container_image:
          overridden = True

      # Container of the current (Python) SDK is overridden separately, hence
      # not updated here.
      if (is_apache_beam_container(new_container_image) and not overridden and
          new_container_image != current_sdk_container_image):
        new_container_image = (
            DataflowApplicationClient._update_container_image_for_dataflow(
                docker_payload.container_image))

      if not new_container_image:
        raise ValueError(
            'SDK Docker container image has to be a non-empty string')

      new_payload = copy(docker_payload)
      new_payload.container_image = new_container_image
      environment.payload = new_payload.SerializeToString()

  def create_job_description(self, job):
    """Creates a job described by the workflow proto."""
    DataflowApplicationClient._apply_sdk_environment_overrides(
        job.proto_pipeline, self._sdk_image_overrides, job.options)

    # Stage other resources for the SDK harness
    resources = self._stage_resources(job.proto_pipeline, job.options)

    # Stage proto pipeline.
    self.stage_file(
        job.google_cloud_options.staging_location,
        shared_names.STAGED_PIPELINE_FILENAME,
        io.BytesIO(job.proto_pipeline.SerializeToString()))

    job.proto.environment = Environment(
        proto_pipeline_staged_url=FileSystems.join(
            job.google_cloud_options.staging_location,
            shared_names.STAGED_PIPELINE_FILENAME),
        packages=resources,
        options=job.options,
        environment_version=self.environment_version,
        proto_pipeline=job.proto_pipeline).proto
    _LOGGER.debug('JOB: %s', job)

  @retry.with_exponential_backoff(num_retries=3, initial_delay_secs=3)
  def get_job_metrics(self, job_id):
    request = dataflow.DataflowProjectsLocationsJobsGetMetricsRequest()
    request.jobId = job_id
    request.location = self.google_cloud_options.region
    request.projectId = self.google_cloud_options.project
    try:
      response = self._client.projects_locations_jobs.GetMetrics(request)
    except exceptions.BadStatusCodeError as e:
      _LOGGER.error(
          'HTTP status %d. Unable to query metrics', e.response.status)
      raise
    return response

  @retry.with_exponential_backoff(num_retries=3)
  def submit_job_description(self, job):
    """Creates and executes a job request."""
    request = dataflow.DataflowProjectsLocationsJobsCreateRequest()
    request.projectId = self.google_cloud_options.project
    request.location = self.google_cloud_options.region
    request.job = job.proto

    try:
      response = self._client.projects_locations_jobs.Create(request)
    except exceptions.BadStatusCodeError as e:
      _LOGGER.error(
          'HTTP status %d trying to create job'
          ' at dataflow service endpoint %s',
          e.response.status,
          self.google_cloud_options.dataflow_endpoint)
      _LOGGER.fatal('details of server error: %s', e)
      raise

    if response.clientRequestId and \
        response.clientRequestId != job.proto.clientRequestId:
      if self.google_cloud_options.update:
        raise DataflowJobAlreadyExistsError(
            "The job named %s with id: %s has already been updated into job "
            "id: %s and cannot be updated again." %
            (response.name, job.proto.replaceJobId, response.id))
      else:
        raise DataflowJobAlreadyExistsError(
            'There is already active job named %s with id: %s. If you want to '
            'submit a second job, try again by setting a different name using '
            '--job_name.' % (response.name, response.id))

    _LOGGER.info('Create job: %s', response)
    # The response is a Job proto with the id for the new job.
    _LOGGER.info('Created job with id: [%s]', response.id)
    _LOGGER.info('Submitted job: %s', response.id)
    _LOGGER.info(
        'To access the Dataflow monitoring console, please navigate to '
        'https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s',
        self.google_cloud_options.region,
        response.id,
        self.google_cloud_options.project)

    return response

  @retry.with_exponential_backoff()  # Using retry defaults from utils/retry.py
  def modify_job_state(self, job_id, new_state):
    """Modify the run state of the job.

    Args:
      job_id: The id of the job.
      new_state: A string representing the new desired state. It could be set
        to either 'JOB_STATE_DONE', 'JOB_STATE_CANCELLED' or
        'JOB_STATE_DRAINING'.

    Returns:
      True if the job was modified successfully.
    """
    if new_state == 'JOB_STATE_DONE':
      new_state = dataflow.Job.RequestedStateValueValuesEnum.JOB_STATE_DONE
    elif new_state == 'JOB_STATE_CANCELLED':
      new_state = dataflow.Job.RequestedStateValueValuesEnum.JOB_STATE_CANCELLED
    elif new_state == 'JOB_STATE_DRAINING':
      new_state = dataflow.Job.RequestedStateValueValuesEnum.JOB_STATE_DRAINING
    else:
      # Other states could only be set by the service.
      return False

    request = dataflow.DataflowProjectsLocationsJobsUpdateRequest()
    request.jobId = job_id
    request.projectId = self.google_cloud_options.project
    request.location = self.google_cloud_options.region
    request.job = dataflow.Job(requestedState=new_state)

    self._client.projects_locations_jobs.Update(request)
    return True

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_notfound_filter)
  def get_job(self, job_id):
    """Gets the job status for a submitted job.

    Args:
      job_id: A string representing the job_id for the workflow as returned
        by the create_job() request.

    Returns:
      A Job proto. See below for interesting fields.

    The Job proto returned from a get_job() request contains some interesting
    fields:
      currentState: An object representing the current state of the job. The
        string representation of the object (str() result) has the following
        possible values: JOB_STATE_UNKNOWN, JOB_STATE_STOPPED,
        JOB_STATE_RUNNING, JOB_STATE_DONE, JOB_STATE_FAILED,
        JOB_STATE_CANCELLED.
      createTime: UTC time when the job was created
        (e.g. '2015-03-10T00:01:53.074Z')
      currentStateTime: UTC time for the current state of the job.
    """
    request = dataflow.DataflowProjectsLocationsJobsGetRequest()
    request.jobId = job_id
    request.projectId = self.google_cloud_options.project
    request.location = self.google_cloud_options.region
    response = self._client.projects_locations_jobs.Get(request)
    return response

  @retry.with_exponential_backoff(
      retry_filter=retry.retry_on_server_errors_and_notfound_filter)
  def list_messages(
      self,
      job_id,
      start_time=None,
      end_time=None,
      page_token=None,
      minimum_importance=None):
    """List messages associated with the execution of a job.

    Args:
      job_id: A string representing the job_id for the workflow as returned
        by the create_job() request.
      start_time: If specified, only messages generated after the start time
        will be returned, otherwise all messages since job started will be
        returned. The value is a string representing UTC time
        (e.g., '2015-08-18T21:03:50.644Z')
      end_time: If specified, only messages generated before the end time
        will be returned, otherwise all messages up to current time will be
        returned. The value is a string representing UTC time
        (e.g., '2015-08-18T21:03:50.644Z')
      page_token: A string to be used as next page token if the list call
        returned paginated results.
      minimum_importance: Filter for messages based on importance. The possible
        string values in increasing order of importance are: JOB_MESSAGE_DEBUG,
        JOB_MESSAGE_DETAILED, JOB_MESSAGE_BASIC, JOB_MESSAGE_WARNING,
        JOB_MESSAGE_ERROR. For example, a filter set on warning will allow only
        warnings and errors and exclude all others.

    Returns:
      A tuple consisting of a list of JobMessage instances and a
      next page token string.

    Raises:
      RuntimeError: if an unexpected value for the message_importance argument
        is used.

    The JobMessage objects returned by the call contain the following fields:
      id: A unique string identifier for the message.
      time: A string representing the UTC time of the message
        (e.g., '2015-08-18T21:03:50.644Z')
      messageImportance: An enumeration value for the message importance. The
        value if converted to string will have the following possible values:
        JOB_MESSAGE_DEBUG, JOB_MESSAGE_DETAILED, JOB_MESSAGE_BASIC,
        JOB_MESSAGE_WARNING, JOB_MESSAGE_ERROR.
      messageText: A message string.
    """
    request = dataflow.DataflowProjectsLocationsJobsMessagesListRequest(
        jobId=job_id,
        location=self.google_cloud_options.region,
        projectId=self.google_cloud_options.project)
    if page_token is not None:
      request.pageToken = page_token
    if start_time is not None:
      request.startTime = start_time
    if end_time is not None:
      request.endTime = end_time
    if minimum_importance is not None:
      if minimum_importance == 'JOB_MESSAGE_DEBUG':
        request.minimumImportance = (
            dataflow.DataflowProjectsLocationsJobsMessagesListRequest.
            MinimumImportanceValueValuesEnum.JOB_MESSAGE_DEBUG)
      elif minimum_importance == 'JOB_MESSAGE_DETAILED':
        request.minimumImportance = (
            dataflow.DataflowProjectsLocationsJobsMessagesListRequest.
            MinimumImportanceValueValuesEnum.JOB_MESSAGE_DETAILED)
      elif minimum_importance == 'JOB_MESSAGE_BASIC':
        request.minimumImportance = (
            dataflow.DataflowProjectsLocationsJobsMessagesListRequest.
            MinimumImportanceValueValuesEnum.JOB_MESSAGE_BASIC)
      elif minimum_importance == 'JOB_MESSAGE_WARNING':
        request.minimumImportance = (
            dataflow.DataflowProjectsLocationsJobsMessagesListRequest.
            MinimumImportanceValueValuesEnum.JOB_MESSAGE_WARNING)
      elif minimum_importance == 'JOB_MESSAGE_ERROR':
        request.minimumImportance = (
            dataflow.DataflowProjectsLocationsJobsMessagesListRequest.
            MinimumImportanceValueValuesEnum.JOB_MESSAGE_ERROR)
      else:
        raise RuntimeError(
            'Unexpected value for minimum_importance argument: %r' %
            minimum_importance)
    response = self._client.projects_locations_jobs_messages.List(request)
    return response.jobMessages, response.nextPageToken

  def job_id_for_name(self, job_name):
    token = None
    while True:
      request = dataflow.DataflowProjectsLocationsJobsListRequest(
          projectId=self.google_cloud_options.project,
          location=self.google_cloud_options.region,
          pageToken=token)
      response = self._client.projects_locations_jobs.List(request)
      for job in response.jobs:
        if (job.name == job_name and job.currentState in [
            dataflow.Job.CurrentStateValueValuesEnum.JOB_STATE_RUNNING,
            dataflow.Job.CurrentStateValueValuesEnum.JOB_STATE_DRAINING
        ]):
          return job.id
      token = response.nextPageToken
      if token is None:
        raise ValueError("No running job found with name '%s'" % job_name)


class MetricUpdateTranslators(object):
  """Translators between accumulators and dataflow metric updates."""
  @staticmethod
  def translate_boolean(accumulator, metric_update_proto):
    metric_update_proto.boolean = accumulator.value

  @staticmethod
  def translate_scalar_mean_int(accumulator, metric_update_proto):
    if accumulator.count:
      metric_update_proto.integerMean = dataflow.IntegerMean()
      metric_update_proto.integerMean.sum = to_split_int(accumulator.sum)
      metric_update_proto.integerMean.count = to_split_int(accumulator.count)
    else:
      metric_update_proto.nameAndKind.kind = None

  @staticmethod
  def translate_scalar_mean_float(accumulator, metric_update_proto):
    if accumulator.count:
      metric_update_proto.floatingPointMean = dataflow.FloatingPointMean()
      metric_update_proto.floatingPointMean.sum = accumulator.sum
      metric_update_proto.floatingPointMean.count = to_split_int(
          accumulator.count)
    else:
      metric_update_proto.nameAndKind.kind = None

  @staticmethod
  def translate_scalar_counter_int(accumulator, metric_update_proto):
    metric_update_proto.integer = to_split_int(accumulator.value)

  @staticmethod
  def translate_scalar_counter_float(accumulator, metric_update_proto):
    metric_update_proto.floatingPoint = accumulator.value


class _LegacyDataflowStager(Stager):
  def __init__(self, dataflow_application_client):
    super().__init__()
    self._dataflow_application_client = dataflow_application_client

  def stage_artifact(self, local_path_to_artifact, artifact_name, sha256):
    self._dataflow_application_client._gcs_file_copy(
        local_path_to_artifact, artifact_name, sha256)

  def commit_manifest(self):
    pass

  @staticmethod
  def get_sdk_package_name():
    """For internal use only; no backwards-compatibility guarantees.

    Returns the PyPI package name to be staged to Google Cloud Dataflow.
    """
    return shared_names.BEAM_PACKAGE_NAME


class DataflowJobAlreadyExistsError(retry.PermanentException):
  """A non-retryable exception indicating that a job with the given name
  already exists."""
  # Inherits retry.PermanentException to avoid retry in
  # DataflowApplicationClient.submit_job_description
  pass


def to_split_int(n):
  res = dataflow.SplitInt64()
  res.lowBits = n & 0xffffffff
  res.highBits = n >> 32
  return res


# TODO: Used in legacy batch worker. Move under MetricUpdateTranslators
# after Runner V2 transition.
def translate_distribution(distribution_update, metric_update_proto):
  """Translate metrics DistributionUpdate to dataflow distribution update.

  Args:
    distribution_update: Instance of DistributionData,
      DistributionInt64Accumulator or DataflowDistributionCounter.
    metric_update_proto: Used to report metrics.
  """
  dist_update_proto = dataflow.DistributionUpdate()
  dist_update_proto.min = to_split_int(distribution_update.min)
  dist_update_proto.max = to_split_int(distribution_update.max)
  dist_update_proto.count = to_split_int(distribution_update.count)
  dist_update_proto.sum = to_split_int(distribution_update.sum)
  # DataflowDistributionCounter needs to translate histogram
  if isinstance(distribution_update, DataflowDistributionCounter):
    dist_update_proto.histogram = dataflow.Histogram()
    distribution_update.translate_to_histogram(dist_update_proto.histogram)
  metric_update_proto.distribution = dist_update_proto


# TODO: Used in legacy batch worker. Delete after Runner V2 transition.
def translate_value(value, metric_update_proto):
  metric_update_proto.integer = to_split_int(value)
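
# Illustrative note (not part of the original file): a worked example of the
# 64-bit split performed by to_split_int() above.
#
#   to_split_int((1 << 40) + 5)
#   # -> a SplitInt64 with lowBits = 5 (the value modulo 2**32)
#   #    and highBits = 256 (the value shifted right by 32 bits)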


def _get_container_image_tag():
  base_version = pkg_resources.parse_version(
      beam_version.__version__).base_version
  if base_version != beam_version.__version__:
    warnings.warn(
        "A non-standard version of Beam SDK detected: %s. "
        "Dataflow runner will use container image tag %s. "
        "This use case is not supported." %
        (beam_version.__version__, base_version))
  return base_version


def get_container_image_from_options(pipeline_options):
  """For internal use only; no backwards-compatibility guarantees.

  Args:
    pipeline_options (PipelineOptions): A container for pipeline options.

  Returns:
    str: Container image for remote execution.
  """
  from apache_beam.runners.dataflow.dataflow_runner import _is_runner_v2
  worker_options = pipeline_options.view_as(WorkerOptions)
  if worker_options.sdk_container_image:
    return worker_options.sdk_container_image

  is_runner_v2 = _is_runner_v2(pipeline_options)

  # Legacy and runner v2 exist in different repositories.
  # Set to legacy format, override if runner v2.
  container_repo = names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY
  image_name = '{repository}/python{major}{minor}'.format(
      repository=container_repo,
      major=sys.version_info[0],
      minor=sys.version_info[1])

  if is_runner_v2:
    image_name = '{repository}/beam_python{major}.{minor}_sdk'.format(
        repository=container_repo,
        major=sys.version_info[0],
        minor=sys.version_info[1])

  image_tag = _get_required_container_version(is_runner_v2)
  return image_name + ':' + image_tag
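
# Illustrative note (not part of the original file): the image names produced
# above, with <repo> standing in for names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY
# and assuming no --sdk_container_image override on Python 3.10:
#
#   legacy worker:  '<repo>/python310:<tag>'
#   runner v2:      '<repo>/beam_python3.10_sdk:<tag>'
#
# where <tag> comes from _get_required_container_version(is_runner_v2).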


def _get_required_container_version(is_runner_v2):
  """For internal use only; no backwards-compatibility guarantees.

  Args:
    is_runner_v2 (bool): True if and only if pipeline is using runner v2.

  Returns:
    str: The tag of worker container images in GCR that corresponds to
      current version of the SDK.
  """
  if 'dev' in beam_version.__version__:
    if is_runner_v2:
      return names.BEAM_FNAPI_CONTAINER_VERSION
    else:
      return names.BEAM_CONTAINER_VERSION
  else:
    return _get_container_image_tag()


def get_response_encoding():
  """Encoding to use to decode HTTP response from Google APIs."""
  return 'utf8'


def _verify_interpreter_version_is_supported(pipeline_options):
  if ('%s.%s' %
      (sys.version_info[0],
       sys.version_info[1]) in _PYTHON_VERSIONS_SUPPORTED_BY_DATAFLOW):
    return

  if 'dev' in beam_version.__version__:
    return

  debug_options = pipeline_options.view_as(DebugOptions)
  if (debug_options.experiments and
      'use_unsupported_python_version' in debug_options.experiments):
    return

  raise Exception(
      'Dataflow runner currently supports Python versions %s, got %s.\n'
      'To ignore this requirement and start a job '
      'using an unsupported version of Python interpreter, pass '
      '--experiment use_unsupported_python_version pipeline option.' %
      (_PYTHON_VERSIONS_SUPPORTED_BY_DATAFLOW, sys.version))


# To enable a counter on the service, add it to this dictionary.
# This is required for the legacy python dataflow runner, as portability
# does not communicate to the service via python code, but instead via
# a runner harness (in C++ or Java).
# TODO(https://github.com/apache/beam/issues/19433): Remove this antipattern,
# legacy dataflow python pipelines will break whenever a new cy_combiner type
# is used.
structured_counter_translations = {
    cy_combiners.CountCombineFn: (
        dataflow.CounterMetadata.KindValueValuesEnum.SUM,
        MetricUpdateTranslators.translate_scalar_counter_int),
    cy_combiners.SumInt64Fn: (
        dataflow.CounterMetadata.KindValueValuesEnum.SUM,
        MetricUpdateTranslators.translate_scalar_counter_int),
    cy_combiners.MinInt64Fn: (
        dataflow.CounterMetadata.KindValueValuesEnum.MIN,
        MetricUpdateTranslators.translate_scalar_counter_int),
    cy_combiners.MaxInt64Fn: (
        dataflow.CounterMetadata.KindValueValuesEnum.MAX,
        MetricUpdateTranslators.translate_scalar_counter_int),
    cy_combiners.MeanInt64Fn: (
        dataflow.CounterMetadata.KindValueValuesEnum.MEAN,
        MetricUpdateTranslators.translate_scalar_mean_int),
    cy_combiners.SumFloatFn: (
        dataflow.CounterMetadata.KindValueValuesEnum.SUM,
        MetricUpdateTranslators.translate_scalar_counter_float),
    cy_combiners.MinFloatFn: (
        dataflow.CounterMetadata.KindValueValuesEnum.MIN,
        MetricUpdateTranslators.translate_scalar_counter_float),
    cy_combiners.MaxFloatFn: (
        dataflow.CounterMetadata.KindValueValuesEnum.MAX,
        MetricUpdateTranslators.translate_scalar_counter_float),
    cy_combiners.MeanFloatFn: (
        dataflow.CounterMetadata.KindValueValuesEnum.MEAN,
        MetricUpdateTranslators.translate_scalar_mean_float),
    cy_combiners.AllCombineFn: (
        dataflow.CounterMetadata.KindValueValuesEnum.AND,
        MetricUpdateTranslators.translate_boolean),
    cy_combiners.AnyCombineFn: (
        dataflow.CounterMetadata.KindValueValuesEnum.OR,
        MetricUpdateTranslators.translate_boolean),
    cy_combiners.DataflowDistributionCounterFn: (
        dataflow.CounterMetadata.KindValueValuesEnum.DISTRIBUTION,
        translate_distribution),
    cy_combiners.DistributionInt64Fn: (
        dataflow.CounterMetadata.KindValueValuesEnum.DISTRIBUTION,
        translate_distribution),
}

counter_translations = {
    cy_combiners.CountCombineFn: (
        dataflow.NameAndKind.KindValueValuesEnum.SUM,
        MetricUpdateTranslators.translate_scalar_counter_int),
    cy_combiners.SumInt64Fn: (
        dataflow.NameAndKind.KindValueValuesEnum.SUM,
        MetricUpdateTranslators.translate_scalar_counter_int),
    cy_combiners.MinInt64Fn: (
        dataflow.NameAndKind.KindValueValuesEnum.MIN,
        MetricUpdateTranslators.translate_scalar_counter_int),
    cy_combiners.MaxInt64Fn: (
        dataflow.NameAndKind.KindValueValuesEnum.MAX,
        MetricUpdateTranslators.translate_scalar_counter_int),
    cy_combiners.MeanInt64Fn: (
        dataflow.NameAndKind.KindValueValuesEnum.MEAN,
        MetricUpdateTranslators.translate_scalar_mean_int),
    cy_combiners.SumFloatFn: (
        dataflow.NameAndKind.KindValueValuesEnum.SUM,
        MetricUpdateTranslators.translate_scalar_counter_float),
    cy_combiners.MinFloatFn: (
        dataflow.NameAndKind.KindValueValuesEnum.MIN,
        MetricUpdateTranslators.translate_scalar_counter_float),
    cy_combiners.MaxFloatFn: (
        dataflow.NameAndKind.KindValueValuesEnum.MAX,
        MetricUpdateTranslators.translate_scalar_counter_float),
    cy_combiners.MeanFloatFn: (
        dataflow.NameAndKind.KindValueValuesEnum.MEAN,
        MetricUpdateTranslators.translate_scalar_mean_float),
    cy_combiners.AllCombineFn: (
        dataflow.NameAndKind.KindValueValuesEnum.AND,
        MetricUpdateTranslators.translate_boolean),
    cy_combiners.AnyCombineFn: (
        dataflow.NameAndKind.KindValueValuesEnum.OR,
        MetricUpdateTranslators.translate_boolean),
    cy_combiners.DataflowDistributionCounterFn: (
        dataflow.NameAndKind.KindValueValuesEnum.DISTRIBUTION,
        translate_distribution),
    cy_combiners.DistributionInt64Fn: (
        dataflow.CounterMetadata.KindValueValuesEnum.DISTRIBUTION,
        translate_distribution),
}
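
# Illustrative sketch (not part of the original file): how these tables map a
# cy_combiner class to a counter kind and a translator, per the comment above
# the dictionaries. The accumulator and metric_update_proto objects below are
# hypothetical stand-ins for what the legacy worker supplies.
#
#   kind, translator = counter_translations[cy_combiners.SumInt64Fn]
#   # kind is dataflow.NameAndKind.KindValueValuesEnum.SUM; the translator
#   # copies the accumulator value into the Dataflow metric update proto:
#   translator(accumulator, metric_update_proto)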