github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/dataproc/dataproc_cluster_manager.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file

import logging
import re
import time
from typing import Optional
from typing import Tuple

from apache_beam import version as beam_version
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive.dataproc.types import ClusterMetadata
from apache_beam.runners.interactive.utils import obfuscate
from apache_beam.runners.interactive.utils import progress_indicated

try:
  from google.cloud import dataproc_v1
  from apache_beam.io.gcp import gcsfilesystem  #pylint: disable=ungrouped-imports
except ImportError:

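  # A stub standing in for dataproc_v1 when the optional GCP dependencies are
  # not installed, so that annotations such as dataproc_v1.Cluster below still
  # resolve at import time.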
  class UnimportedDataproc:
    Cluster = None

  dataproc_v1 = UnimportedDataproc()

_LOGGER = logging.getLogger(__name__)

# Name of the log file auto-generated by Dataproc. We use it to locate the
# startup output of the Flink daemon and retrieve the master URL and
# dashboard information.
DATAPROC_STAGING_LOG_NAME = 'dataproc-initialization-script-0_output'

# Home directory of the OS user yarn.
YARN_HOME = '/var/lib/hadoop-yarn'

# Initialization action script. It configures the OS user yarn to use gcloud
# as the Docker credential helper, tunes some taskmanager settings for better
# parallelism, and finally starts the YARN application: a Flink cluster in
# session mode.
INIT_ACTION = """#!/bin/bash
sudo -u yarn gcloud auth configure-docker --quiet

readonly FLINK_INSTALL_DIR='/usr/lib/flink'
readonly MASTER_HOSTNAME="$(/usr/share/google/get_metadata_value attributes/dataproc-master)"

cat <<EOF >>${FLINK_INSTALL_DIR}/conf/flink-conf.yaml
taskmanager.memory.network.fraction: 0.2
taskmanager.memory.network.min: 64mb
taskmanager.memory.network.max: 1gb
EOF
sed -i \
    "s/^taskmanager.network.numberOfBuffers: 2048/taskmanager.network.numberOfBuffers: 8192/" \
    ${FLINK_INSTALL_DIR}/conf/flink-conf.yaml

if [[ "${HOSTNAME}" == "${MASTER_HOSTNAME}" ]]; then
  . /usr/bin/flink-yarn-daemon
fi
"""


class DataprocClusterManager:
  """Self-contained cluster manager that controls the lifecycle of a Dataproc
  cluster shared by one or more pipelines under Interactive Beam.
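
  A minimal usage sketch (values are illustrative and assume ClusterMetadata
  accepts these fields as keyword arguments), typically from a notebook
  session:

    from apache_beam.runners.interactive import interactive_beam as ib
    from apache_beam.runners.interactive.dataproc.types import ClusterMetadata

    ib.options.cache_root = 'gs://<your-bucket>/interactive-cache'
    meta = ClusterMetadata(project_id='<your-project>', region='us-central1')
    manager = DataprocClusterManager(meta)
    manager.create_flink_cluster()  # Fills in meta.master_url and dashboard.
    # ... run pipelines on the cluster ...
    manager.cleanup()  # Deletes the cluster and cleans up staged files.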
  """
  def __init__(self, cluster_metadata: ClusterMetadata) -> None:
    """Initializes the DataprocClusterManager with properties required
    to interface with the Dataproc ClusterControllerClient.
    """
    self.cluster_metadata = cluster_metadata
    # Pipelines whose jobs are executed on the cluster.
    self.pipelines = set()
    self._cluster_client = dataproc_v1.ClusterControllerClient(
        client_options={
            'api_endpoint':
            f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
        })
    self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
    self._staging_directory = None
    cache_dir = ie.current_env().options.cache_root
    if not cache_dir.startswith('gs://'):
      error_msg = (
          'ib.options.cache_root needs to be a Cloud Storage bucket to '
          'cache source recordings and PCollections in the current '
          f'interactive setup; instead, \'{cache_dir}\' is assigned.')
      _LOGGER.error(error_msg)
      raise ValueError(error_msg)
    self._cache_root = cache_dir.rstrip('/')

  def stage_init_action(self) -> str:
    """Stages the initialization action script to the GCS cache root to set
    up Dataproc clusters.

    Returns the staged GCS file path.
    """
    # Version the initialization action script by its content so that a new
    # copy is staged only when the script changes.
    init_action_ver = obfuscate(INIT_ACTION)
    path = f'{self._cache_root}/dataproc-init-action-{init_action_ver}.sh'
    if not self._fs.exists(path):
      with self._fs.create(path) as bwriter:
        bwriter.write(INIT_ACTION.encode())
    return path

  @progress_indicated
  def create_cluster(self, cluster: dict) -> None:
    """Attempts to create a cluster using attributes that were
    initialized with the DataprocClusterManager instance.

    Args:
      cluster: Dictionary representing Dataproc cluster. Read more about the
          schema for clusters here:
          https://cloud.google.com/python/docs/reference/dataproc/latest/google.cloud.dataproc_v1.types.Cluster
    """
    if self.cluster_metadata.master_url:
      return
    try:
      self._cluster_client.create_cluster(
          request={
              'project_id': self.cluster_metadata.project_id,
              'region': self.cluster_metadata.region,
              'cluster': cluster
          })
    except Exception as e:
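      # This handling assumes the Dataproc client raises google.api_core
      # exceptions whose `code` attribute carries the HTTP status of the
      # failed call: 409 cluster already exists, 403 insufficient permissions,
      # 501 the regional endpoint does not exist.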
      if e.code == 409:
        _LOGGER.info(
            'Cluster %s already exists. Continuing...',
            self.cluster_metadata.cluster_name)
      elif e.code == 403:
        _LOGGER.error(
            'Due to insufficient project permissions, '
            'unable to create cluster: %s',
            self.cluster_metadata.cluster_name)
        raise ValueError(
            'You cannot create a cluster in project: {}'.format(
                self.cluster_metadata.project_id))
      elif e.code == 501:
        _LOGGER.error(
            'Invalid region provided: %s', self.cluster_metadata.region)
        raise ValueError(
            'Region {} does not exist!'.format(self.cluster_metadata.region))
      else:
        _LOGGER.error(
            'Unable to create cluster: %s', self.cluster_metadata.cluster_name)
        raise e
    else:
      _LOGGER.info(
          'Cluster created successfully: %s',
          self.cluster_metadata.cluster_name)
      self._staging_directory = self.get_staging_location()
      master_url, dashboard = self.get_master_url_and_dashboard()
      self.cluster_metadata.master_url = master_url
      self.cluster_metadata.dashboard = dashboard

  def create_flink_cluster(self) -> None:
    """Calls create_cluster with a configuration that enables FlinkRunner."""
    init_action_path = self.stage_init_action()
    cluster = {
        'project_id': self.cluster_metadata.project_id,
        'cluster_name': self.cluster_metadata.cluster_name,
        'config': {
            'software_config': {
                # TODO(https://github.com/apache/beam/issues/21527): Uncomment
                # these lines when a Dataproc image is released with previously
                # missing dependencies.
                # 'image_version': ie.current_env().clusters.
                # DATAPROC_IMAGE_VERSION,
                'optional_components': ['DOCKER', 'FLINK'],
                'properties': {
                    # Enforces HOME dir for user yarn.
                    'yarn:yarn.nodemanager.user-home-dir': YARN_HOME
                }
            },
            'initialization_actions': [{
                'executable_file': init_action_path
            }],
            'gce_cluster_config': {
                'metadata': {
                    'flink-start-yarn-session': 'false'
                },
                'service_account_scopes': [
                    'https://www.googleapis.com/auth/cloud-platform'
                ]
            },
            'master_config': {
                # Dataproc requires exactly one master instance.
                'num_instances': 1
            },
            'worker_config': {},
            'endpoint_config': {
                'enable_http_port_access': True
            }
        },
        'labels': {
            'goog-dataflow-notebook': beam_version.__version__.replace(
                '.', '_')
        }
    }

    # Additional gce_cluster_config.
    gce_cluster_config = cluster['config']['gce_cluster_config']
    if self.cluster_metadata.subnetwork:
      gce_cluster_config['subnetwork_uri'] = self.cluster_metadata.subnetwork

    # Additional InstanceGroupConfig for master and workers.
    master_config = cluster['config']['master_config']
    worker_config = cluster['config']['worker_config']
    if self.cluster_metadata.num_workers:
      worker_config['num_instances'] = self.cluster_metadata.num_workers
    if self.cluster_metadata.machine_type:
      master_config['machine_type_uri'] = self.cluster_metadata.machine_type
      worker_config['machine_type_uri'] = self.cluster_metadata.machine_type

    self.create_cluster(cluster)

  def cleanup(self) -> None:
    """Deletes the cluster that uses the attributes initialized
    with the DataprocClusterManager instance."""
    try:
      self._cluster_client.delete_cluster(
          request={
              'project_id': self.cluster_metadata.project_id,
              'region': self.cluster_metadata.region,
              'cluster_name': self.cluster_metadata.cluster_name,
          })
      self.cleanup_staging_files()
    except Exception as e:
      if e.code == 403:
        _LOGGER.error(
            'Due to insufficient project permissions, '
            'unable to clean up the default cluster: %s',
            self.cluster_metadata.cluster_name)
        raise ValueError(
            'You cannot delete a cluster in project: {}'.format(
                self.cluster_metadata.project_id))
      elif e.code == 404:
        _LOGGER.error(
            'Cluster does not exist: %s', self.cluster_metadata.cluster_name)
        raise ValueError(
            'Cluster was not found: {}'.format(
                self.cluster_metadata.cluster_name))
      else:
        _LOGGER.error(
            'Failed to delete cluster: %s', self.cluster_metadata.cluster_name)
        raise e

  def get_cluster_details(self) -> dataproc_v1.Cluster:
    """Gets the dataproc_v1.Cluster object for the current cluster manager."""
    try:
      return self._cluster_client.get_cluster(
          request={
              'project_id': self.cluster_metadata.project_id,
              'region': self.cluster_metadata.region,
              'cluster_name': self.cluster_metadata.cluster_name
          })
    except Exception as e:
      if e.code == 403:
        _LOGGER.error(
            'Due to insufficient project permissions, '
            'unable to retrieve information for cluster: %s',
            self.cluster_metadata.cluster_name)
        raise ValueError(
            'You cannot view clusters in project: {}'.format(
                self.cluster_metadata.project_id))
      elif e.code == 404:
        _LOGGER.error(
            'Cluster does not exist: %s', self.cluster_metadata.cluster_name)
        raise ValueError(
            'Cluster was not found: {}'.format(
                self.cluster_metadata.cluster_name))
      else:
        _LOGGER.error(
            'Failed to get information for cluster: %s',
            self.cluster_metadata.cluster_name)
        raise e

  def wait_for_cluster_to_provision(self) -> None:
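    # Block until the cluster leaves the CREATING state, polling every 15
    # seconds; provisioning typically takes a few minutes.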
    while self.get_cluster_details().status.state.name == 'CREATING':
      time.sleep(15)

  def get_staging_location(self) -> str:
    """Gets the staging bucket of an existing Dataproc cluster."""
    try:
      self.wait_for_cluster_to_provision()
      cluster_details = self.get_cluster_details()
      bucket_name = cluster_details.config.config_bucket
      gcs_path = 'gs://' + bucket_name + '/google-cloud-dataproc-metainfo/'
      for file in self._fs._list(gcs_path):
        if self.cluster_metadata.cluster_name in file.path:
          # This file path split will look something like:
          # ['gs://.../google-cloud-dataproc-metainfo/{staging_dir}/',
          # '-{node-type}/dataproc-initialization-script-0_output']
          return file.path.split(self.cluster_metadata.cluster_name)[0]
    except Exception as e:
      _LOGGER.error(
          'Failed to get the staging bucket of cluster %s.',
          self.cluster_metadata.cluster_name)
      raise e

  def parse_master_url_and_dashboard(self, line: str) -> Tuple[str, str]:
    """Parses the master_url and YARN application_id of the Flink process from
    an input line. The line containing both the master_url and application id
    is always formatted as such:
    {text} Found Web Interface {master_url} of application
    '{application_id}'.\\n

    Truncated example where '...' represents additional text between segments:
    ... google-dataproc-startup[000]: ... activate-component-flink[0000]:
    ...org.apache.flink.yarn.YarnClusterDescriptor... [] -
    Found Web Interface example-master-url:50000 of application
    'application_123456789000_0001'.

    Returns the flink_master_url and dashboard link as a tuple."""
    cluster_details = self.get_cluster_details()
    yarn_endpoint = cluster_details.config.endpoint_config.http_ports[
        'YARN ResourceManager']
    segment = line.split('Found Web Interface ')[1].split(' of application ')
    master_url = segment[0]
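    # Strip the wrapping quotes and the trailing period/newline so only the
    # bare application id remains.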
    application_id = re.sub('\'|.\n', '', segment[1])
    dashboard = re.sub(
        '/yarn/',
        '/gateway/default/yarn/proxy/' + application_id + '/',
        yarn_endpoint)
    return master_url, dashboard

  def get_master_url_and_dashboard(self) -> Tuple[Optional[str], Optional[str]]:
    """Returns the master_url and dashboard of the current cluster."""
    startup_logs = []
    for file in self._fs._list(self._staging_directory):
      if DATAPROC_STAGING_LOG_NAME in file.path:
        startup_logs.append(file.path)

    for log in startup_logs:
      content = self._fs.open(log)
      for line in content.readlines():
        decoded_line = line.decode()
        if 'Found Web Interface' in decoded_line:
          return self.parse_master_url_and_dashboard(decoded_line)
    return None, None

  def cleanup_staging_files(self) -> None:
    if self._staging_directory:
      staging_files = [
          file.path for file in self._fs._list(self._staging_directory)
      ]
      self._fs.delete(staging_files)
    if self._cache_root:
      cache_files = [file.path for file in self._fs._list(self._cache_root)]
      self._fs.delete(cache_files)