github.com/apache/beam/sdks/v2@v2.48.2/python/gen_protos.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
    19  Generates Python proto modules and grpc stubs for Beam protos.
    20  """
    21  
    22  import contextlib
    23  import glob
    24  import inspect
    25  import logging
    26  import os
    27  import platform
    28  import re
    29  import shutil
    30  import subprocess
    31  import sys
    32  import time
    33  from collections import defaultdict
    34  from importlib import import_module
    35  
    36  import pkg_resources
    37  
# Module-wide logger. Calling getLogger() without a name returns the root
# logger, so the INFO level set below is applied to the root logger.
LOG = logging.getLogger()
LOG.setLevel(logging.INFO)

# License text written (after lstrip()) at the top of the __init__.py files
# produced by generate_init_files_full.
LICENSE_HEADER = """
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""

# Module docstring written (after lstrip()) into the top-level api/__init__.py
# by generate_init_files_full, right after the license header.
NO_PROMISES_NOTICE = """
\"\"\"
For internal use only; no backwards-compatibility guarantees.
Automatically generated when running setup.py sdist or build[_py].
\"\"\"
"""
    66  
    67  
    68  def clean_path(path):
    69    return os.path.realpath(os.path.abspath(path))
    70  
    71  
# These paths are relative to the project root
BEAM_PROTO_PATHS = [
    os.path.join('model', 'pipeline', 'src', 'main', 'proto'),
    os.path.join('model', 'job-management', 'src', 'main', 'proto'),
    os.path.join('model', 'fn-execution', 'src', 'main', 'proto'),
    os.path.join('model', 'interactive', 'src', 'main', 'proto'),
]

# Directory containing this script (absolute, symlinks resolved).
PYTHON_SDK_ROOT = os.path.dirname(clean_path(__file__))
# Two directory levels above PYTHON_SDK_ROOT.
PROJECT_ROOT = clean_path(os.path.join(PYTHON_SDK_ROOT, '..', '..'))
# Directory that receives the generated proto modules and copied resources.
PYTHON_OUTPUT_PATH = os.path.join(
    PYTHON_SDK_ROOT, 'apache_beam', 'portability', 'api')

# Non-proto files, relative to PROJECT_ROOT, that generate_proto_files copies
# into PYTHON_OUTPUT_PATH.
MODEL_RESOURCES = [
    os.path.normpath((
        'model/fn-execution/src/main/resources/org/'
        'apache/beam/model/fnexecution/v1/standard_coders.yaml')),
]
    90  
    91  
    92  class PythonPath(object):
    93    def __init__(self, path: str, front: bool = False):
    94      self._path = path
    95      self._front = front
    96  
    97    def __enter__(self):
    98      if not self._path:
    99        return
   100  
   101      self._sys_path = sys.path.copy()
   102      if self._front:
   103        sys.path.insert(0, self._path)
   104      else:
   105        sys.path.append(self._path)
   106  
   107    def __exit__(self, exc_type, exc_val, exc_tb):
   108      if not self._path:
   109        return
   110  
   111      sys.path = self._sys_path
   112  
   113  
   114  def generate_urn_files(out_dir, api_path):
   115    """
   116    Create python files with statically defined URN constants.
   117  
   118    Creates a <proto>_pb2_urn.py file for each <proto>_pb2.py file that contains
   119    an enum type.
   120  
   121    This works by importing each api.<proto>_pb2 module created by `protoc`,
   122    inspecting the module's contents, and generating a new side-car urn module.
   123    This is executed at build time rather than dynamically on import to ensure
   124    that it is compatible with static type checkers like mypy.
   125    """
   126    from google.protobuf import message
   127    from google.protobuf.internal import api_implementation
   128    if api_implementation.Type() == 'python':
   129      from google.protobuf.internal import containers
   130      repeated_types = (
   131          list,
   132          containers.RepeatedScalarFieldContainer,
   133          containers.RepeatedCompositeFieldContainer)
   134    elif api_implementation.Type() == 'upb':
   135      from google._upb import _message
   136      repeated_types = (
   137          list,
   138          _message.RepeatedScalarContainer,
   139          _message.RepeatedCompositeContainer)
   140    elif api_implementation.Type() == 'cpp':
   141      from google.protobuf.pyext import _message
   142      repeated_types = (
   143          list,
   144          _message.RepeatedScalarContainer,
   145          _message.RepeatedCompositeContainer)
   146    else:
   147      raise TypeError(
   148          "Unknown proto implementation: " + api_implementation.Type())
   149  
   150    class Context(object):
   151      INDENT = '  '
   152      CAP_SPLIT = re.compile('([A-Z][^A-Z]*|^[a-z]+)')
   153  
   154      def __init__(self, indent=0):
   155        self.lines = []
   156        self.imports = set()
   157        self.empty_types = set()
   158        self._indent = indent
   159  
   160      @contextlib.contextmanager
   161      def indent(self):
   162        self._indent += 1
   163        yield
   164        self._indent -= 1
   165  
   166      def prepend(self, s):
   167        if s:
   168          self.lines.insert(0, (self.INDENT * self._indent) + s + '\n')
   169        else:
   170          self.lines.insert(0, '\n')
   171  
   172      def line(self, s):
   173        if s:
   174          self.lines.append((self.INDENT * self._indent) + s + '\n')
   175        else:
   176          self.lines.append('\n')
   177  
   178      def import_type(self, typ):
   179        modname = typ.__module__
   180        if modname in ('__builtin__', 'builtin'):
   181          return typ.__name__
   182        else:
   183          self.imports.add(modname)
   184          _, modname = modname.rsplit('.', 1)
   185          return modname + '.' + typ.__name__
   186  
   187      @staticmethod
   188      def is_message_type(obj):
   189        return isinstance(obj, type) and \
   190               issubclass(obj, message.Message)
   191  
   192      @staticmethod
   193      def is_enum_type(obj):
   194        return type(obj).__name__ == 'EnumTypeWrapper'
   195  
   196      def python_repr(self, obj):
   197        if isinstance(obj, message.Message):
   198          return self.message_repr(obj)
   199        elif isinstance(obj, repeated_types):
   200          return '[%s]' % ', '.join(self.python_repr(x) for x in obj)
   201        else:
   202          return repr(obj)
   203  
   204      def empty_type(self, typ):
   205        name = (
   206            'EMPTY_' +
   207            '_'.join(x.upper() for x in self.CAP_SPLIT.findall(typ.__name__)))
   208        self.empty_types.add('%s = %s()' % (name, self.import_type(typ)))
   209        return name
   210  
   211      def message_repr(self, msg):
   212        parts = []
   213        for field, value in msg.ListFields():
   214          parts.append('%s=%s' % (field.name, self.python_repr(value)))
   215        if parts:
   216          return '%s(%s)' % (self.import_type(type(msg)), ', '.join(parts))
   217        else:
   218          return self.empty_type(type(msg))
   219  
   220      def write_enum(self, enum_name, enum, indent):
   221        ctx = Context(indent=indent)
   222        with ctx.indent():
   223          for enum_value_name in enum.values_by_name:
   224            enum_value_descriptor = enum.values_by_name[enum_value_name]
   225            extensions = enum_value_descriptor.GetOptions().Extensions
   226            prop = (
   227                extensions[beam_runner_api_pb2.beam_urn],
   228                extensions[beam_runner_api_pb2.beam_constant],
   229                extensions[metrics_pb2.monitoring_info_spec],
   230                extensions[metrics_pb2.label_props],
   231            )
   232            reprs = [self.python_repr(x) for x in prop]
   233            if all(x == "''" or x.startswith('EMPTY_') for x in reprs):
   234              continue
   235            ctx.line(
   236                '%s = PropertiesFromEnumValue(%s)' %
   237                (enum_value_name, ', '.join(self.python_repr(x) for x in prop)))
   238  
   239        if ctx.lines:
   240          ctx.prepend('class %s(object):' % enum_name)
   241          ctx.prepend('')
   242          ctx.line('')
   243        return ctx.lines
   244  
   245      def write_message(self, message_name, message, indent=0):
   246        ctx = Context(indent=indent)
   247  
   248        with ctx.indent():
   249          for obj_name, obj in inspect.getmembers(message):
   250            if obj_name == 'DESCRIPTOR':
   251              for enum_name in obj.enum_types_by_name:
   252                enum = obj.enum_types_by_name[enum_name]
   253                ctx.lines += self.write_enum(enum_name, enum, ctx._indent)
   254  
   255        if ctx.lines:
   256          ctx.prepend('class %s(object):' % message_name)
   257          ctx.prepend('')
   258        return ctx.lines
   259  
   260    pb2_files = list(glob.glob(os.path.join(out_dir, '*_pb2.py')))
   261  
   262    with PythonPath(os.path.dirname(api_path), front=True):
   263      beam_runner_api_pb2 = import_module(
   264          'api.org.apache.beam.model.pipeline.v1.beam_runner_api_pb2')
   265      metrics_pb2 = import_module(
   266          'api.org.apache.beam.model.pipeline.v1.metrics_pb2')
   267  
   268      for pb2_file in pb2_files:
   269        modname = os.path.splitext(pb2_file)[0]
   270        out_file = modname + '_urns.py'
   271        api_start_idx = modname.index(os.path.sep + 'api' + os.path.sep)
   272        import_path = modname[api_start_idx + 1:].replace(os.path.sep, '.')
   273        mod = import_module(import_path)
   274  
   275        ctx = Context()
   276        for obj_name, obj in inspect.getmembers(mod):
   277          if ctx.is_message_type(obj):
   278            ctx.lines += ctx.write_message(obj_name, obj)
   279  
   280        if ctx.lines:
   281          for line in reversed(sorted(ctx.empty_types)):
   282            ctx.prepend(line)
   283  
   284          for modname in reversed(sorted(ctx.imports)):
   285            pkg, target = modname.rsplit('.', 1)
   286            rel_import = build_relative_import(api_path, pkg, out_file)
   287            ctx.prepend('from %s import %s' % (rel_import, target))
   288  
   289          rel_import = build_relative_import(
   290              os.path.dirname(api_path), 'utils', out_file)
   291          ctx.prepend('from %s import PropertiesFromEnumValue' % rel_import)
   292  
   293          LOG.info("Writing urn stubs: %s" % out_file)
   294          with open(out_file, 'w') as f:
   295            f.writelines(ctx.lines)
   296  
   297  
   298  def _find_protoc_gen_mypy():
   299    # NOTE: this shouldn't be necessary if the virtualenv's environment
   300    #  is passed to tasks below it, since protoc will search the PATH itself
   301    fname = 'protoc-gen-mypy'
   302    if platform.system() == 'Windows':
   303      fname += ".exe"
   304  
   305    pathstr = os.environ.get('PATH')
   306    search_paths = pathstr.split(os.pathsep) if pathstr else []
   307    # should typically be installed into the venv's bin dir
   308    search_paths.insert(0, os.path.dirname(sys.executable))
   309    for path in search_paths:
   310      fullpath = os.path.join(path, fname)
   311      if os.path.exists(fullpath):
   312        LOG.info('Found protoc_gen_mypy at %s' % fullpath)
   313        return fullpath
   314    raise RuntimeError(
   315        "Could not find %s in %s" % (fname, ', '.join(search_paths)))
   316  
   317  
   318  def find_by_ext(root_dir, ext):
   319    for root, _, files in os.walk(root_dir):
   320      for file in files:
   321        if file.endswith(ext):
   322          yield clean_path(os.path.join(root, file))
   323  
   324  
   325  def ensure_grpcio_exists():
   326    try:
   327      from grpc_tools import protoc  # pylint: disable=unused-import
   328    except ImportError:
   329      return _install_grpcio_tools()
   330  
   331  
   332  def _install_grpcio_tools():
   333    """
   334    Though wheels are available for grpcio-tools, setup_requires uses
   335    easy_install which doesn't understand them.  This means that it is
   336    compiled from scratch (which is expensive as it compiles the full
   337    protoc compiler).  Instead, we attempt to install a wheel in a temporary
   338    directory and add it to the path as needed.
   339    See https://github.com/pypa/setuptools/issues/377
   340    """
   341    install_path = os.path.join(PYTHON_SDK_ROOT, '.eggs', 'grpcio-wheels')
   342    logging.warning('Installing grpcio-tools into %s', install_path)
   343    start = time.time()
   344    subprocess.check_call([
   345        sys.executable,
   346        '-m',
   347        'pip',
   348        'install',
   349        '--target',
   350        install_path,
   351        '--upgrade',
   352        '-r',
   353        os.path.join(PYTHON_SDK_ROOT, 'build-requirements.txt')
   354    ])
   355    logging.warning(
   356        'Installing grpcio-tools took %0.2f seconds.', time.time() - start)
   357  
   358    return install_path
   359  
   360  
   361  def build_relative_import(root_path, import_path, start_file_path):
   362    tail_path = import_path.replace('.', os.path.sep)
   363    source_path = os.path.join(root_path, tail_path)
   364  
   365    is_module = os.path.isfile(source_path + '.py')
   366    if is_module:
   367      source_path = os.path.dirname(source_path)
   368  
   369    rel_path = os.path.relpath(
   370        source_path, start=os.path.dirname(start_file_path))
   371  
   372    if rel_path == '.':
   373      if is_module:
   374        rel_path += os.path.basename(tail_path)
   375  
   376      return rel_path
   377  
   378    if rel_path.endswith('..'):
   379      rel_path += os.path.sep
   380  
   381    # In a path that looks like ../../../foo, every double dot
   382    # after the right most double dot needs to be collapsed to
   383    # a single dot to look like ././../foo to which we can convert
   384    # to ....foo for the proper relative import.
   385    first_half_idx = rel_path.rfind('..' + os.path.sep)
   386    if first_half_idx == 0:
   387      return rel_path.replace(os.path.sep, '')
   388  
   389    first_half = rel_path[:first_half_idx].replace('..', '.')
   390    final_import = first_half.replace(os.path.sep, '') + '..' + \
   391           rel_path[first_half_idx+3:].replace(os.path.sep, '.')
   392  
   393    if is_module:
   394      if final_import.count('.') == len(final_import):
   395        return final_import + os.path.basename(tail_path)
   396  
   397      return final_import + '.{}'.format(os.path.basename(tail_path))
   398  
   399    return final_import
   400  
   401  
   402  def generate_init_files_lite(api_root):
   403    proto_root = os.path.join(api_root, 'org')
   404    for root, _, _ in os.walk(proto_root):
   405      init_file = os.path.join(root, '__init__.py')
   406      with open(init_file, 'w+'):
   407        pass
   408  
   409  
   410  def generate_init_files_full(api_root):
   411    proto_root = os.path.join(api_root, 'org')
   412    api_module_root = os.path.join(api_root, '__init__.py')
   413    modules = defaultdict(list)
   414  
   415    for root, _, files in os.walk(proto_root):
   416      init_file = os.path.join(root, '__init__.py')
   417      with open(init_file, 'w+') as f:
   418        f.write(LICENSE_HEADER.lstrip())
   419        for file in files:
   420          if not file.endswith('.py') or file == '__init__.py':
   421            continue
   422          module_name = file.split('.')[0]
   423          f.write('from . import {}\n'.format(module_name))
   424          modules[root].append(module_name)
   425  
   426    with open(api_module_root, 'w+') as f:
   427      f.write(LICENSE_HEADER.lstrip())
   428      f.write(NO_PROMISES_NOTICE.lstrip())
   429      remaining_lines = []
   430  
   431      duplicate_modules = {}
   432      for module_root, modules in modules.items():
   433        import_path = os.path.relpath(module_root,
   434                                      api_root).replace(os.path.sep, '.')
   435        import_root, imported_module = import_path.rsplit('.', 1)
   436  
   437        if imported_module not in duplicate_modules:
   438          f.write('from .{} import {}\n'.format(import_root, imported_module))
   439          duplicate_modules[imported_module] = 1
   440        else:
   441          duplicate_modules[imported_module] += 1
   442          module_alias = '{}_{}'.format(
   443              imported_module, duplicate_modules[imported_module])
   444          f.write(
   445              'from .{} import {} as {}\n'.format(
   446                  import_root, imported_module, module_alias))
   447          imported_module = module_alias
   448  
   449        for module in modules:
   450          remaining_lines.append(
   451              '{module} = {}.{module}\n'.format(imported_module, module=module))
   452      f.write('\n')
   453      f.writelines(remaining_lines)
   454  
   455  
   456  def generate_proto_files(force=False):
   457    """
   458    Will compile proto files for python. If force is not true, then several
   459    heuristics are used to determine whether a compilation is necessary. If
   460    a compilation is not necessary, no compilation will be performed.
   461    :param force: Whether to force a recompilation of the proto files.
   462    """
   463    proto_dirs = [
   464        clean_path(os.path.join(PROJECT_ROOT, path)) for path in BEAM_PROTO_PATHS
   465    ]
   466    proto_files = [
   467        proto_file for d in proto_dirs for proto_file in find_by_ext(d, '.proto')
   468    ]
   469  
   470    out_files = list(find_by_ext(PYTHON_OUTPUT_PATH, '_pb2.py'))
   471  
   472    if out_files and not proto_files and not force:
   473      # We have out_files but no protos; assume they're up-to-date.
   474      # This is actually the common case (e.g. installation from an sdist).
   475      LOG.info('No proto files; using existing generated files.')
   476      return
   477  
   478    elif not out_files and not proto_files:
   479      model = os.path.join(PROJECT_ROOT, 'model')
   480      if os.path.exists(model):
   481        error_msg = 'No proto files found in %s.' % proto_dirs
   482      else:
   483        error_msg = 'Not in apache git tree, unable to find proto definitions.'
   484  
   485      raise RuntimeError(error_msg)
   486  
   487    if force:
   488      regenerate_reason = 'forced'
   489    elif not out_files:
   490      regenerate_reason = 'no output files'
   491    elif len(out_files) < len(proto_files):
   492      regenerate_reason = 'not enough output files'
   493    elif (min(os.path.getmtime(path) for path in out_files) <= max(
   494        os.path.getmtime(path)
   495        for path in proto_files + [os.path.realpath(__file__)])):
   496      regenerate_reason = 'output files are out-of-date'
   497    elif len(out_files) > len(proto_files):
   498      regenerate_reason = 'output files without corresponding .proto files'
   499      # too many output files: probably due to switching between git branches.
   500      # remove them so they don't trigger constant regeneration.
   501      for out_file in out_files:
   502        os.remove(out_file)
   503    else:
   504      regenerate_reason = ''
   505  
   506    if not regenerate_reason:
   507      LOG.info('Skipping proto regeneration: all files up to date')
   508      return
   509  
   510    shutil.rmtree(PYTHON_OUTPUT_PATH, ignore_errors=True)
   511    if not os.path.exists(PYTHON_OUTPUT_PATH):
   512      os.mkdir(PYTHON_OUTPUT_PATH)
   513  
   514    grpcio_install_loc = ensure_grpcio_exists()
   515    protoc_gen_mypy = _find_protoc_gen_mypy()
   516    with PythonPath(grpcio_install_loc):
   517      from grpc_tools import protoc
   518      builtin_protos = pkg_resources.resource_filename('grpc_tools', '_proto')
   519      args = (
   520          [sys.executable] +  # expecting to be called from command line
   521          ['--proto_path=%s' % builtin_protos] +
   522          ['--proto_path=%s' % d
   523           for d in proto_dirs] + ['--python_out=%s' % PYTHON_OUTPUT_PATH] +
   524          ['--plugin=protoc-gen-mypy=%s' % protoc_gen_mypy] +
   525          # new version of mypy-protobuf converts None to zero default value
   526          # and remove Optional from the param type annotation. This causes
   527          # some mypy errors. So to mitigate and fall back to old behavior,
   528          # use `relax_strict_optional_primitives` flag. more at
   529          # https://github.com/nipunn1313/mypy-protobuf/tree/main#relax_strict_optional_primitives # pylint:disable=line-too-long
   530          ['--mypy_out=relax_strict_optional_primitives:%s' % PYTHON_OUTPUT_PATH
   531           ] +
   532          # TODO(robertwb): Remove the prefix once it's the default.
   533          ['--grpc_python_out=grpc_2_0:%s' % PYTHON_OUTPUT_PATH] + proto_files)
   534  
   535      LOG.info('Regenerating Python proto definitions (%s).' % regenerate_reason)
   536      ret_code = protoc.main(args)
   537      if ret_code:
   538        raise RuntimeError(
   539            'Protoc returned non-zero status (see logs for details): '
   540            '%s' % ret_code)
   541  
   542    # copy resource files
   543    for path in MODEL_RESOURCES:
   544      shutil.copy2(os.path.join(PROJECT_ROOT, path), PYTHON_OUTPUT_PATH)
   545  
   546    proto_packages = set()
   547    # see: https://github.com/protocolbuffers/protobuf/issues/1491
   548    # force relative import paths for proto files
   549    compiled_import_re = re.compile('^from (.*) import (.*)$')
   550    for file_path in find_by_ext(PYTHON_OUTPUT_PATH,
   551                                 ('_pb2.py', '_pb2_grpc.py', '_pb2.pyi')):
   552      proto_packages.add(os.path.dirname(file_path))
   553      lines = []
   554      with open(file_path, encoding='utf-8') as f:
   555        for line in f:
   556          match_obj = compiled_import_re.match(line)
   557          if match_obj and \
   558                  match_obj.group(1).startswith('org.apache.beam.model'):
   559            new_import = build_relative_import(
   560                PYTHON_OUTPUT_PATH, match_obj.group(1), file_path)
   561            line = 'from %s import %s\n' % (new_import, match_obj.group(2))
   562  
   563          lines.append(line)
   564  
   565      with open(file_path, 'w') as f:
   566        f.writelines(lines)
   567  
   568    generate_init_files_lite(PYTHON_OUTPUT_PATH)
   569    with PythonPath(grpcio_install_loc):
   570      for proto_package in proto_packages:
   571        generate_urn_files(proto_package, PYTHON_OUTPUT_PATH)
   572  
   573      generate_init_files_full(PYTHON_OUTPUT_PATH)
   574  
   575  
if __name__ == '__main__':
  # Running this file as a script always regenerates: force=True bypasses the
  # up-to-date heuristics in generate_proto_files.
  generate_proto_files(force=True)