github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/dataproc/dataproc_cluster_manager.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file

import logging
import re
import time
from typing import Optional
from typing import Tuple

from apache_beam import version as beam_version
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive.dataproc.types import ClusterMetadata
from apache_beam.runners.interactive.utils import obfuscate
from apache_beam.runners.interactive.utils import progress_indicated

try:
  from google.cloud import dataproc_v1
  from apache_beam.io.gcp import gcsfilesystem  #pylint: disable=ungrouped-imports
except ImportError:

  class UnimportedDataproc:
    Cluster = None

  dataproc_v1 = UnimportedDataproc()

_LOGGER = logging.getLogger(__name__)

# Name of the log file auto-generated by Dataproc. We use it to locate the
# startup output of the Flink daemon to retrieve master url and dashboard
# information.
DATAPROC_STAGING_LOG_NAME = 'dataproc-initialization-script-0_output'

# Home dir of os user yarn.
YARN_HOME = '/var/lib/hadoop-yarn'

# Configures the os user yarn to use gcloud as the docker credHelper.
# Also sets some taskmanager configurations for better parallelism.
# Finally starts the yarn application: flink cluster in session mode.
INIT_ACTION = """#!/bin/bash
sudo -u yarn gcloud auth configure-docker --quiet

readonly FLINK_INSTALL_DIR='/usr/lib/flink'
readonly MASTER_HOSTNAME="$(/usr/share/google/get_metadata_value attributes/dataproc-master)"

cat <<EOF >>${FLINK_INSTALL_DIR}/conf/flink-conf.yaml
taskmanager.memory.network.fraction: 0.2
taskmanager.memory.network.min: 64mb
taskmanager.memory.network.max: 1gb
EOF
sed -i \
  "s/^taskmanager.network.numberOfBuffers: 2048/taskmanager.network.numberOfBuffers: 8192/" \
  ${FLINK_INSTALL_DIR}/conf/flink-conf.yaml

if [[ "${HOSTNAME}" == "${MASTER_HOSTNAME}" ]]; then
  . /usr/bin/flink-yarn-daemon
fi
"""


class DataprocClusterManager:
  """Self-contained cluster manager that controls the lifecycle of a Dataproc
  cluster connected by one or more pipelines under Interactive Beam.
  """
  def __init__(self, cluster_metadata: ClusterMetadata) -> None:
    """Initializes the DataprocClusterManager with properties required
    to interface with the Dataproc ClusterControllerClient.
    """
    self.cluster_metadata = cluster_metadata
    # Pipelines whose jobs are executed on the cluster.
    self.pipelines = set()
    self._cluster_client = dataproc_v1.ClusterControllerClient(
        client_options={
            'api_endpoint': \
            f'{self.cluster_metadata.region}-dataproc.googleapis.com:443'
        })
    self._fs = gcsfilesystem.GCSFileSystem(PipelineOptions())
    self._staging_directory = None
    cache_dir = ie.current_env().options.cache_root
    if not cache_dir.startswith('gs://'):
      error_msg = (
          'ib.options.cache_root needs to be a Cloud Storage '
          'Bucket to cache source recording and PCollections in current '
          f'interactive setup, instead \'{cache_dir}\' is assigned.')
      _LOGGER.error(error_msg)
      raise ValueError(error_msg)
    self._cache_root = cache_dir.rstrip('/')

  def stage_init_action(self) -> str:
    """Stages the initialization action script to GCS cache root to set up
    Dataproc clusters.

    Returns the staged gcs file path.
    """
    # Versionizes the initialization action script.
    init_action_ver = obfuscate(INIT_ACTION)
    path = f'{self._cache_root}/dataproc-init-action-{init_action_ver}.sh'
    if not self._fs.exists(path):
      with self._fs.create(path) as bwriter:
        bwriter.write(INIT_ACTION.encode())
    return path

  @progress_indicated
  def create_cluster(self, cluster: dict) -> None:
    """Attempts to create a cluster using attributes that were
    initialized with the DataprocClusterManager instance.

    Args:
      cluster: Dictionary representing Dataproc cluster. Read more about the
          schema for clusters here:
          https://cloud.google.com/python/docs/reference/dataproc/latest/google.cloud.dataproc_v1.types.Cluster
    """
    if self.cluster_metadata.master_url:
      return
    try:
      self._cluster_client.create_cluster(
          request={
              'project_id': self.cluster_metadata.project_id,
              'region': self.cluster_metadata.region,
              'cluster': cluster
          })
    except Exception as e:
      if e.code == 409:
        _LOGGER.info(
            'Cluster %s already exists. Continuing...',
            self.cluster_metadata.cluster_name)
      elif e.code == 403:
        _LOGGER.error(
            'Due to insufficient project permissions, '
            'unable to create cluster: %s',
            self.cluster_metadata.cluster_name)
        raise ValueError(
            'You cannot create a cluster in project: {}'.format(
                self.cluster_metadata.project_id))
      elif e.code == 501:
        _LOGGER.error(
            'Invalid region provided: %s', self.cluster_metadata.region)
        raise ValueError(
            'Region {} does not exist!'.format(self.cluster_metadata.region))
      else:
        _LOGGER.error(
            'Unable to create cluster: %s', self.cluster_metadata.cluster_name)
        raise e
    else:
      _LOGGER.info(
          'Cluster created successfully: %s',
          self.cluster_metadata.cluster_name)
      self._staging_directory = self.get_staging_location()
      master_url, dashboard = self.get_master_url_and_dashboard()
      self.cluster_metadata.master_url = master_url
      self.cluster_metadata.dashboard = dashboard

  def create_flink_cluster(self) -> None:
    """Calls create_cluster with a configuration that enables FlinkRunner."""
    init_action_path = self.stage_init_action()
    cluster = {
        'project_id': self.cluster_metadata.project_id,
        'cluster_name': self.cluster_metadata.cluster_name,
        'config': {
            'software_config': {
                # TODO(https://github.com/apache/beam/issues/21527): Uncomment
                # these lines when a Dataproc image is released with previously
                # missing dependencies.
                # 'image_version': ie.current_env().clusters.
                # DATAPROC_IMAGE_VERSION,
                'optional_components': ['DOCKER', 'FLINK'],
                'properties': {
                    # Enforces HOME dir for user yarn.
                    'yarn:yarn.nodemanager.user-home-dir': YARN_HOME
                }
            },
            'initialization_actions': [{
                'executable_file': init_action_path
            }],
            'gce_cluster_config': {
                'metadata': {
                    'flink-start-yarn-session': 'false'
                },
                'service_account_scopes': [
                    'https://www.googleapis.com/auth/cloud-platform'
                ]
            },
            'master_config': {
                # There must be 1 and only 1 instance of master.
                'num_instances': 1
            },
            'worker_config': {},
            'endpoint_config': {
                'enable_http_port_access': True
            }
        },
        'labels': {
            'goog-dataflow-notebook': beam_version.__version__.replace(
                '.', '_')
        }
    }

    # Additional gce_cluster_config.
    gce_cluster_config = cluster['config']['gce_cluster_config']
    if self.cluster_metadata.subnetwork:
      gce_cluster_config['subnetwork_uri'] = self.cluster_metadata.subnetwork

    # Additional InstanceGroupConfig for master and workers.
    master_config = cluster['config']['master_config']
    worker_config = cluster['config']['worker_config']
    if self.cluster_metadata.num_workers:
      worker_config['num_instances'] = self.cluster_metadata.num_workers
    if self.cluster_metadata.machine_type:
      master_config['machine_type_uri'] = self.cluster_metadata.machine_type
      worker_config['machine_type_uri'] = self.cluster_metadata.machine_type

    self.create_cluster(cluster)

  def cleanup(self) -> None:
    """Deletes the cluster that uses the attributes initialized
    with the DataprocClusterManager instance."""
    try:
      self._cluster_client.delete_cluster(
          request={
              'project_id': self.cluster_metadata.project_id,
              'region': self.cluster_metadata.region,
              'cluster_name': self.cluster_metadata.cluster_name,
          })
      self.cleanup_staging_files()
    except Exception as e:
      if e.code == 403:
        _LOGGER.error(
            'Due to insufficient project permissions, '
            'unable to clean up the default cluster: %s',
            self.cluster_metadata.cluster_name)
        raise ValueError(
            'You cannot delete a cluster in project: {}'.format(
                self.cluster_metadata.project_id))
      elif e.code == 404:
        _LOGGER.error(
            'Cluster does not exist: %s', self.cluster_metadata.cluster_name)
        raise ValueError(
            'Cluster was not found: {}'.format(
                self.cluster_metadata.cluster_name))
      else:
        _LOGGER.error(
            'Failed to delete cluster: %s', self.cluster_metadata.cluster_name)
        raise e

  def get_cluster_details(self) -> dataproc_v1.Cluster:
    """Gets the Dataproc_v1 Cluster object for the current cluster manager."""
    try:
      return self._cluster_client.get_cluster(
          request={
              'project_id': self.cluster_metadata.project_id,
              'region': self.cluster_metadata.region,
              'cluster_name': self.cluster_metadata.cluster_name
          })
    except Exception as e:
      if e.code == 403:
        _LOGGER.error(
            'Due to insufficient project permissions, '
            'unable to retrieve information for cluster: %s',
            self.cluster_metadata.cluster_name)
        raise ValueError(
            'You cannot view clusters in project: {}'.format(
                self.cluster_metadata.project_id))
      elif e.code == 404:
        _LOGGER.error(
            'Cluster does not exist: %s', self.cluster_metadata.cluster_name)
        raise ValueError(
            'Cluster was not found: {}'.format(
                self.cluster_metadata.cluster_name))
      else:
        _LOGGER.error(
            'Failed to get information for cluster: %s',
            self.cluster_metadata.cluster_name)
        raise e

  def wait_for_cluster_to_provision(self) -> None:
    while self.get_cluster_details().status.state.name == 'CREATING':
      time.sleep(15)

  def get_staging_location(self) -> str:
    """Gets the staging bucket of an existing Dataproc cluster."""
    try:
      self.wait_for_cluster_to_provision()
      cluster_details = self.get_cluster_details()
      bucket_name = cluster_details.config.config_bucket
      gcs_path = 'gs://' + bucket_name + '/google-cloud-dataproc-metainfo/'
      for file in self._fs._list(gcs_path):
        if self.cluster_metadata.cluster_name in file.path:
          # this file path split will look something like:
          # ['gs://.../google-cloud-dataproc-metainfo/{staging_dir}/',
          # '-{node-type}/dataproc-initialization-script-0_output']
          return file.path.split(self.cluster_metadata.cluster_name)[0]
    except Exception as e:
      _LOGGER.error(
          'Failed to get %s cluster staging bucket.',
          self.cluster_metadata.cluster_name)
      raise e

  def parse_master_url_and_dashboard(self, line: str) -> Tuple[str, str]:
    """Parses the master_url and YARN application_id of the Flink process from
    an input line. The line containing both the master_url and application id
    is always formatted as such:
    {text} Found Web Interface {master_url} of application
    '{application_id}'.\\n

    Truncated example where '...' represents additional text between segments:
    ... google-dataproc-startup[000]: ... activate-component-flink[0000]:
    ...org.apache.flink.yarn.YarnClusterDescriptor... [] -
    Found Web Interface example-master-url:50000 of application
    'application_123456789000_0001'.

    Returns the flink_master_url and dashboard link as a tuple."""
    cluster_details = self.get_cluster_details()
    yarn_endpoint = cluster_details.config.endpoint_config.http_ports[
        'YARN ResourceManager']
    segment = line.split('Found Web Interface ')[1].split(' of application ')
    master_url = segment[0]
    application_id = re.sub('\'|.\n', '', segment[1])
    dashboard = re.sub(
        '/yarn/',
        '/gateway/default/yarn/proxy/' + application_id + '/',
        yarn_endpoint)
    return master_url, dashboard

  def get_master_url_and_dashboard(self) -> Tuple[Optional[str], Optional[str]]:
    """Returns the master_url of the current cluster."""
    startup_logs = []
    for file in self._fs._list(self._staging_directory):
      if DATAPROC_STAGING_LOG_NAME in file.path:
        startup_logs.append(file.path)

    for log in startup_logs:
      content = self._fs.open(log)
      for line in content.readlines():
        decoded_line = line.decode()
        if 'Found Web Interface' in decoded_line:
          return self.parse_master_url_and_dashboard(decoded_line)
    return None, None

  def cleanup_staging_files(self) -> None:
    if self._staging_directory:
      staging_files = [
          file.path for file in self._fs._list(self._staging_directory)
      ]
      self._fs.delete(staging_files)
    if self._cache_root:
      cache_files = [file.path for file in self._fs._list(self._cache_root)]
      self._fs.delete(cache_files)
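
A minimal usage sketch follows for reference; it is not part of the module above. The ClusterMetadata keyword arguments and the assignment to ie.current_env().options.cache_root are assumed from the attributes this file reads (project_id, region, cluster_name, master_url, dashboard), and the bucket, project, and cluster names are placeholders.

# Illustrative sketch only; not part of dataproc_cluster_manager.py.
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import (
    DataprocClusterManager)
from apache_beam.runners.interactive.dataproc.types import ClusterMetadata

# __init__ raises ValueError unless the interactive cache root is a gs:// path,
# so point it at a Cloud Storage bucket first (placeholder bucket; assumed
# settable through the interactive options object read by __init__).
ie.current_env().options.cache_root = 'gs://my-bucket/ib-cache'

# Field names mirror the attributes the manager reads; values are placeholders.
metadata = ClusterMetadata(
    project_id='my-project',
    region='us-central1',
    cluster_name='interactive-beam-flink')

manager = DataprocClusterManager(metadata)

# Provisions a Dataproc cluster with the DOCKER and FLINK optional components,
# then fills in metadata.master_url and metadata.dashboard from the staged
# startup logs.
manager.create_flink_cluster()
print(metadata.master_url, metadata.dashboard)

# Deletes the cluster and removes staged init scripts and cached files.
manager.cleanup()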