github.com/apache/beam/sdks/v2@v2.48.2/python/gen_protos.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 """ 19 Generates Python proto modules and grpc stubs for Beam protos. 20 """ 21 22 import contextlib 23 import glob 24 import inspect 25 import logging 26 import os 27 import platform 28 import re 29 import shutil 30 import subprocess 31 import sys 32 import time 33 from collections import defaultdict 34 from importlib import import_module 35 36 import pkg_resources 37 38 LOG = logging.getLogger() 39 LOG.setLevel(logging.INFO) 40 41 LICENSE_HEADER = """ 42 # 43 # Licensed to the Apache Software Foundation (ASF) under one or more 44 # contributor license agreements. See the NOTICE file distributed with 45 # this work for additional information regarding copyright ownership. 46 # The ASF licenses this file to You under the Apache License, Version 2.0 47 # (the "License"); you may not use this file except in compliance with 48 # the License. You may obtain a copy of the License at 49 # 50 # http://www.apache.org/licenses/LICENSE-2.0 51 # 52 # Unless required by applicable law or agreed to in writing, software 53 # distributed under the License is distributed on an "AS IS" BASIS, 54 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 55 # See the License for the specific language governing permissions and 56 # limitations under the License. 57 # 58 """ 59 60 NO_PROMISES_NOTICE = """ 61 \"\"\" 62 For internal use only; no backwards-compatibility guarantees. 63 Automatically generated when running setup.py sdist or build[_py]. 64 \"\"\" 65 """ 66 67 68 def clean_path(path): 69 return os.path.realpath(os.path.abspath(path)) 70 71 72 # These paths are relative to the project root 73 BEAM_PROTO_PATHS = [ 74 os.path.join('model', 'pipeline', 'src', 'main', 'proto'), 75 os.path.join('model', 'job-management', 'src', 'main', 'proto'), 76 os.path.join('model', 'fn-execution', 'src', 'main', 'proto'), 77 os.path.join('model', 'interactive', 'src', 'main', 'proto'), 78 ] 79 80 PYTHON_SDK_ROOT = os.path.dirname(clean_path(__file__)) 81 PROJECT_ROOT = clean_path(os.path.join(PYTHON_SDK_ROOT, '..', '..')) 82 PYTHON_OUTPUT_PATH = os.path.join( 83 PYTHON_SDK_ROOT, 'apache_beam', 'portability', 'api') 84 85 MODEL_RESOURCES = [ 86 os.path.normpath(( 87 'model/fn-execution/src/main/resources/org/' 88 'apache/beam/model/fnexecution/v1/standard_coders.yaml')), 89 ] 90 91 92 class PythonPath(object): 93 def __init__(self, path: str, front: bool = False): 94 self._path = path 95 self._front = front 96 97 def __enter__(self): 98 if not self._path: 99 return 100 101 self._sys_path = sys.path.copy() 102 if self._front: 103 sys.path.insert(0, self._path) 104 else: 105 sys.path.append(self._path) 106 107 def __exit__(self, exc_type, exc_val, exc_tb): 108 if not self._path: 109 return 110 111 sys.path = self._sys_path 112 113 114 def generate_urn_files(out_dir, api_path): 115 """ 116 Create python files with statically defined URN constants. 117 118 Creates a <proto>_pb2_urn.py file for each <proto>_pb2.py file that contains 119 an enum type. 120 121 This works by importing each api.<proto>_pb2 module created by `protoc`, 122 inspecting the module's contents, and generating a new side-car urn module. 123 This is executed at build time rather than dynamically on import to ensure 124 that it is compatible with static type checkers like mypy. 125 """ 126 from google.protobuf import message 127 from google.protobuf.internal import api_implementation 128 if api_implementation.Type() == 'python': 129 from google.protobuf.internal import containers 130 repeated_types = ( 131 list, 132 containers.RepeatedScalarFieldContainer, 133 containers.RepeatedCompositeFieldContainer) 134 elif api_implementation.Type() == 'upb': 135 from google._upb import _message 136 repeated_types = ( 137 list, 138 _message.RepeatedScalarContainer, 139 _message.RepeatedCompositeContainer) 140 elif api_implementation.Type() == 'cpp': 141 from google.protobuf.pyext import _message 142 repeated_types = ( 143 list, 144 _message.RepeatedScalarContainer, 145 _message.RepeatedCompositeContainer) 146 else: 147 raise TypeError( 148 "Unknown proto implementation: " + api_implementation.Type()) 149 150 class Context(object): 151 INDENT = ' ' 152 CAP_SPLIT = re.compile('([A-Z][^A-Z]*|^[a-z]+)') 153 154 def __init__(self, indent=0): 155 self.lines = [] 156 self.imports = set() 157 self.empty_types = set() 158 self._indent = indent 159 160 @contextlib.contextmanager 161 def indent(self): 162 self._indent += 1 163 yield 164 self._indent -= 1 165 166 def prepend(self, s): 167 if s: 168 self.lines.insert(0, (self.INDENT * self._indent) + s + '\n') 169 else: 170 self.lines.insert(0, '\n') 171 172 def line(self, s): 173 if s: 174 self.lines.append((self.INDENT * self._indent) + s + '\n') 175 else: 176 self.lines.append('\n') 177 178 def import_type(self, typ): 179 modname = typ.__module__ 180 if modname in ('__builtin__', 'builtin'): 181 return typ.__name__ 182 else: 183 self.imports.add(modname) 184 _, modname = modname.rsplit('.', 1) 185 return modname + '.' + typ.__name__ 186 187 @staticmethod 188 def is_message_type(obj): 189 return isinstance(obj, type) and \ 190 issubclass(obj, message.Message) 191 192 @staticmethod 193 def is_enum_type(obj): 194 return type(obj).__name__ == 'EnumTypeWrapper' 195 196 def python_repr(self, obj): 197 if isinstance(obj, message.Message): 198 return self.message_repr(obj) 199 elif isinstance(obj, repeated_types): 200 return '[%s]' % ', '.join(self.python_repr(x) for x in obj) 201 else: 202 return repr(obj) 203 204 def empty_type(self, typ): 205 name = ( 206 'EMPTY_' + 207 '_'.join(x.upper() for x in self.CAP_SPLIT.findall(typ.__name__))) 208 self.empty_types.add('%s = %s()' % (name, self.import_type(typ))) 209 return name 210 211 def message_repr(self, msg): 212 parts = [] 213 for field, value in msg.ListFields(): 214 parts.append('%s=%s' % (field.name, self.python_repr(value))) 215 if parts: 216 return '%s(%s)' % (self.import_type(type(msg)), ', '.join(parts)) 217 else: 218 return self.empty_type(type(msg)) 219 220 def write_enum(self, enum_name, enum, indent): 221 ctx = Context(indent=indent) 222 with ctx.indent(): 223 for enum_value_name in enum.values_by_name: 224 enum_value_descriptor = enum.values_by_name[enum_value_name] 225 extensions = enum_value_descriptor.GetOptions().Extensions 226 prop = ( 227 extensions[beam_runner_api_pb2.beam_urn], 228 extensions[beam_runner_api_pb2.beam_constant], 229 extensions[metrics_pb2.monitoring_info_spec], 230 extensions[metrics_pb2.label_props], 231 ) 232 reprs = [self.python_repr(x) for x in prop] 233 if all(x == "''" or x.startswith('EMPTY_') for x in reprs): 234 continue 235 ctx.line( 236 '%s = PropertiesFromEnumValue(%s)' % 237 (enum_value_name, ', '.join(self.python_repr(x) for x in prop))) 238 239 if ctx.lines: 240 ctx.prepend('class %s(object):' % enum_name) 241 ctx.prepend('') 242 ctx.line('') 243 return ctx.lines 244 245 def write_message(self, message_name, message, indent=0): 246 ctx = Context(indent=indent) 247 248 with ctx.indent(): 249 for obj_name, obj in inspect.getmembers(message): 250 if obj_name == 'DESCRIPTOR': 251 for enum_name in obj.enum_types_by_name: 252 enum = obj.enum_types_by_name[enum_name] 253 ctx.lines += self.write_enum(enum_name, enum, ctx._indent) 254 255 if ctx.lines: 256 ctx.prepend('class %s(object):' % message_name) 257 ctx.prepend('') 258 return ctx.lines 259 260 pb2_files = list(glob.glob(os.path.join(out_dir, '*_pb2.py'))) 261 262 with PythonPath(os.path.dirname(api_path), front=True): 263 beam_runner_api_pb2 = import_module( 264 'api.org.apache.beam.model.pipeline.v1.beam_runner_api_pb2') 265 metrics_pb2 = import_module( 266 'api.org.apache.beam.model.pipeline.v1.metrics_pb2') 267 268 for pb2_file in pb2_files: 269 modname = os.path.splitext(pb2_file)[0] 270 out_file = modname + '_urns.py' 271 api_start_idx = modname.index(os.path.sep + 'api' + os.path.sep) 272 import_path = modname[api_start_idx + 1:].replace(os.path.sep, '.') 273 mod = import_module(import_path) 274 275 ctx = Context() 276 for obj_name, obj in inspect.getmembers(mod): 277 if ctx.is_message_type(obj): 278 ctx.lines += ctx.write_message(obj_name, obj) 279 280 if ctx.lines: 281 for line in reversed(sorted(ctx.empty_types)): 282 ctx.prepend(line) 283 284 for modname in reversed(sorted(ctx.imports)): 285 pkg, target = modname.rsplit('.', 1) 286 rel_import = build_relative_import(api_path, pkg, out_file) 287 ctx.prepend('from %s import %s' % (rel_import, target)) 288 289 rel_import = build_relative_import( 290 os.path.dirname(api_path), 'utils', out_file) 291 ctx.prepend('from %s import PropertiesFromEnumValue' % rel_import) 292 293 LOG.info("Writing urn stubs: %s" % out_file) 294 with open(out_file, 'w') as f: 295 f.writelines(ctx.lines) 296 297 298 def _find_protoc_gen_mypy(): 299 # NOTE: this shouldn't be necessary if the virtualenv's environment 300 # is passed to tasks below it, since protoc will search the PATH itself 301 fname = 'protoc-gen-mypy' 302 if platform.system() == 'Windows': 303 fname += ".exe" 304 305 pathstr = os.environ.get('PATH') 306 search_paths = pathstr.split(os.pathsep) if pathstr else [] 307 # should typically be installed into the venv's bin dir 308 search_paths.insert(0, os.path.dirname(sys.executable)) 309 for path in search_paths: 310 fullpath = os.path.join(path, fname) 311 if os.path.exists(fullpath): 312 LOG.info('Found protoc_gen_mypy at %s' % fullpath) 313 return fullpath 314 raise RuntimeError( 315 "Could not find %s in %s" % (fname, ', '.join(search_paths))) 316 317 318 def find_by_ext(root_dir, ext): 319 for root, _, files in os.walk(root_dir): 320 for file in files: 321 if file.endswith(ext): 322 yield clean_path(os.path.join(root, file)) 323 324 325 def ensure_grpcio_exists(): 326 try: 327 from grpc_tools import protoc # pylint: disable=unused-import 328 except ImportError: 329 return _install_grpcio_tools() 330 331 332 def _install_grpcio_tools(): 333 """ 334 Though wheels are available for grpcio-tools, setup_requires uses 335 easy_install which doesn't understand them. This means that it is 336 compiled from scratch (which is expensive as it compiles the full 337 protoc compiler). Instead, we attempt to install a wheel in a temporary 338 directory and add it to the path as needed. 339 See https://github.com/pypa/setuptools/issues/377 340 """ 341 install_path = os.path.join(PYTHON_SDK_ROOT, '.eggs', 'grpcio-wheels') 342 logging.warning('Installing grpcio-tools into %s', install_path) 343 start = time.time() 344 subprocess.check_call([ 345 sys.executable, 346 '-m', 347 'pip', 348 'install', 349 '--target', 350 install_path, 351 '--upgrade', 352 '-r', 353 os.path.join(PYTHON_SDK_ROOT, 'build-requirements.txt') 354 ]) 355 logging.warning( 356 'Installing grpcio-tools took %0.2f seconds.', time.time() - start) 357 358 return install_path 359 360 361 def build_relative_import(root_path, import_path, start_file_path): 362 tail_path = import_path.replace('.', os.path.sep) 363 source_path = os.path.join(root_path, tail_path) 364 365 is_module = os.path.isfile(source_path + '.py') 366 if is_module: 367 source_path = os.path.dirname(source_path) 368 369 rel_path = os.path.relpath( 370 source_path, start=os.path.dirname(start_file_path)) 371 372 if rel_path == '.': 373 if is_module: 374 rel_path += os.path.basename(tail_path) 375 376 return rel_path 377 378 if rel_path.endswith('..'): 379 rel_path += os.path.sep 380 381 # In a path that looks like ../../../foo, every double dot 382 # after the right most double dot needs to be collapsed to 383 # a single dot to look like ././../foo to which we can convert 384 # to ....foo for the proper relative import. 385 first_half_idx = rel_path.rfind('..' + os.path.sep) 386 if first_half_idx == 0: 387 return rel_path.replace(os.path.sep, '') 388 389 first_half = rel_path[:first_half_idx].replace('..', '.') 390 final_import = first_half.replace(os.path.sep, '') + '..' + \ 391 rel_path[first_half_idx+3:].replace(os.path.sep, '.') 392 393 if is_module: 394 if final_import.count('.') == len(final_import): 395 return final_import + os.path.basename(tail_path) 396 397 return final_import + '.{}'.format(os.path.basename(tail_path)) 398 399 return final_import 400 401 402 def generate_init_files_lite(api_root): 403 proto_root = os.path.join(api_root, 'org') 404 for root, _, _ in os.walk(proto_root): 405 init_file = os.path.join(root, '__init__.py') 406 with open(init_file, 'w+'): 407 pass 408 409 410 def generate_init_files_full(api_root): 411 proto_root = os.path.join(api_root, 'org') 412 api_module_root = os.path.join(api_root, '__init__.py') 413 modules = defaultdict(list) 414 415 for root, _, files in os.walk(proto_root): 416 init_file = os.path.join(root, '__init__.py') 417 with open(init_file, 'w+') as f: 418 f.write(LICENSE_HEADER.lstrip()) 419 for file in files: 420 if not file.endswith('.py') or file == '__init__.py': 421 continue 422 module_name = file.split('.')[0] 423 f.write('from . import {}\n'.format(module_name)) 424 modules[root].append(module_name) 425 426 with open(api_module_root, 'w+') as f: 427 f.write(LICENSE_HEADER.lstrip()) 428 f.write(NO_PROMISES_NOTICE.lstrip()) 429 remaining_lines = [] 430 431 duplicate_modules = {} 432 for module_root, modules in modules.items(): 433 import_path = os.path.relpath(module_root, 434 api_root).replace(os.path.sep, '.') 435 import_root, imported_module = import_path.rsplit('.', 1) 436 437 if imported_module not in duplicate_modules: 438 f.write('from .{} import {}\n'.format(import_root, imported_module)) 439 duplicate_modules[imported_module] = 1 440 else: 441 duplicate_modules[imported_module] += 1 442 module_alias = '{}_{}'.format( 443 imported_module, duplicate_modules[imported_module]) 444 f.write( 445 'from .{} import {} as {}\n'.format( 446 import_root, imported_module, module_alias)) 447 imported_module = module_alias 448 449 for module in modules: 450 remaining_lines.append( 451 '{module} = {}.{module}\n'.format(imported_module, module=module)) 452 f.write('\n') 453 f.writelines(remaining_lines) 454 455 456 def generate_proto_files(force=False): 457 """ 458 Will compile proto files for python. If force is not true, then several 459 heuristics are used to determine whether a compilation is necessary. If 460 a compilation is not necessary, no compilation will be performed. 461 :param force: Whether to force a recompilation of the proto files. 462 """ 463 proto_dirs = [ 464 clean_path(os.path.join(PROJECT_ROOT, path)) for path in BEAM_PROTO_PATHS 465 ] 466 proto_files = [ 467 proto_file for d in proto_dirs for proto_file in find_by_ext(d, '.proto') 468 ] 469 470 out_files = list(find_by_ext(PYTHON_OUTPUT_PATH, '_pb2.py')) 471 472 if out_files and not proto_files and not force: 473 # We have out_files but no protos; assume they're up-to-date. 474 # This is actually the common case (e.g. installation from an sdist). 475 LOG.info('No proto files; using existing generated files.') 476 return 477 478 elif not out_files and not proto_files: 479 model = os.path.join(PROJECT_ROOT, 'model') 480 if os.path.exists(model): 481 error_msg = 'No proto files found in %s.' % proto_dirs 482 else: 483 error_msg = 'Not in apache git tree, unable to find proto definitions.' 484 485 raise RuntimeError(error_msg) 486 487 if force: 488 regenerate_reason = 'forced' 489 elif not out_files: 490 regenerate_reason = 'no output files' 491 elif len(out_files) < len(proto_files): 492 regenerate_reason = 'not enough output files' 493 elif (min(os.path.getmtime(path) for path in out_files) <= max( 494 os.path.getmtime(path) 495 for path in proto_files + [os.path.realpath(__file__)])): 496 regenerate_reason = 'output files are out-of-date' 497 elif len(out_files) > len(proto_files): 498 regenerate_reason = 'output files without corresponding .proto files' 499 # too many output files: probably due to switching between git branches. 500 # remove them so they don't trigger constant regeneration. 501 for out_file in out_files: 502 os.remove(out_file) 503 else: 504 regenerate_reason = '' 505 506 if not regenerate_reason: 507 LOG.info('Skipping proto regeneration: all files up to date') 508 return 509 510 shutil.rmtree(PYTHON_OUTPUT_PATH, ignore_errors=True) 511 if not os.path.exists(PYTHON_OUTPUT_PATH): 512 os.mkdir(PYTHON_OUTPUT_PATH) 513 514 grpcio_install_loc = ensure_grpcio_exists() 515 protoc_gen_mypy = _find_protoc_gen_mypy() 516 with PythonPath(grpcio_install_loc): 517 from grpc_tools import protoc 518 builtin_protos = pkg_resources.resource_filename('grpc_tools', '_proto') 519 args = ( 520 [sys.executable] + # expecting to be called from command line 521 ['--proto_path=%s' % builtin_protos] + 522 ['--proto_path=%s' % d 523 for d in proto_dirs] + ['--python_out=%s' % PYTHON_OUTPUT_PATH] + 524 ['--plugin=protoc-gen-mypy=%s' % protoc_gen_mypy] + 525 # new version of mypy-protobuf converts None to zero default value 526 # and remove Optional from the param type annotation. This causes 527 # some mypy errors. So to mitigate and fall back to old behavior, 528 # use `relax_strict_optional_primitives` flag. more at 529 # https://github.com/nipunn1313/mypy-protobuf/tree/main#relax_strict_optional_primitives # pylint:disable=line-too-long 530 ['--mypy_out=relax_strict_optional_primitives:%s' % PYTHON_OUTPUT_PATH 531 ] + 532 # TODO(robertwb): Remove the prefix once it's the default. 533 ['--grpc_python_out=grpc_2_0:%s' % PYTHON_OUTPUT_PATH] + proto_files) 534 535 LOG.info('Regenerating Python proto definitions (%s).' % regenerate_reason) 536 ret_code = protoc.main(args) 537 if ret_code: 538 raise RuntimeError( 539 'Protoc returned non-zero status (see logs for details): ' 540 '%s' % ret_code) 541 542 # copy resource files 543 for path in MODEL_RESOURCES: 544 shutil.copy2(os.path.join(PROJECT_ROOT, path), PYTHON_OUTPUT_PATH) 545 546 proto_packages = set() 547 # see: https://github.com/protocolbuffers/protobuf/issues/1491 548 # force relative import paths for proto files 549 compiled_import_re = re.compile('^from (.*) import (.*)$') 550 for file_path in find_by_ext(PYTHON_OUTPUT_PATH, 551 ('_pb2.py', '_pb2_grpc.py', '_pb2.pyi')): 552 proto_packages.add(os.path.dirname(file_path)) 553 lines = [] 554 with open(file_path, encoding='utf-8') as f: 555 for line in f: 556 match_obj = compiled_import_re.match(line) 557 if match_obj and \ 558 match_obj.group(1).startswith('org.apache.beam.model'): 559 new_import = build_relative_import( 560 PYTHON_OUTPUT_PATH, match_obj.group(1), file_path) 561 line = 'from %s import %s\n' % (new_import, match_obj.group(2)) 562 563 lines.append(line) 564 565 with open(file_path, 'w') as f: 566 f.writelines(lines) 567 568 generate_init_files_lite(PYTHON_OUTPUT_PATH) 569 with PythonPath(grpcio_install_loc): 570 for proto_package in proto_packages: 571 generate_urn_files(proto_package, PYTHON_OUTPUT_PATH) 572 573 generate_init_files_full(PYTHON_OUTPUT_PATH) 574 575 576 if __name__ == '__main__': 577 generate_proto_files(force=True)