github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/ml/gcp/naturallanguageml.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  from typing import Mapping
    19  from typing import Optional
    20  from typing import Sequence
    21  from typing import Tuple
    22  from typing import Union
    23  
    24  import apache_beam as beam
    25  from apache_beam.metrics import Metrics
    26  
    27  try:
    28    from google.cloud import language
    29    from google.cloud import language_v1
    30  except ImportError:
    31    raise ImportError(
    32        'Google Cloud Natural Language API not supported for this execution '
    33        'environment (could not import Natural Language API client).')
    34  
# Names exported by ``from ... import *``: the input wrapper class and the
# annotating transform.
__all__ = ['Document', 'AnnotateText']
    36  
    37  
    38  class Document(object):
    39    """Represents the input to :class:`AnnotateText` transform.
    40  
    41    Args:
    42      content (str): The content of the input or the Google Cloud Storage URI
    43        where the file is stored.
    44      type (`Union[str, google.cloud.language_v1.Document.Type]`): Text type.
    45        Possible values are `HTML`, `PLAIN_TEXT`. The default value is
    46        `PLAIN_TEXT`.
    47      language_hint (`Optional[str]`): The language of the text. If not specified,
    48        language will be automatically detected. Values should conform to
    49        ISO-639-1 standard.
    50      encoding (`Optional[str]`): Text encoding. Possible values are: `NONE`,
    51       `UTF8`, `UTF16`, `UTF32`. The default value is `UTF8`.
    52      from_gcs (bool): Whether the content should be interpret as a Google Cloud
    53        Storage URI. The default value is :data:`False`.
    54    """
    55  
    56    def __init__(
    57        self,
    58        content,  # type: str
    59        type='PLAIN_TEXT',  # type: Union[str, language_v1.Document.Type]
    60        language_hint=None,  # type: Optional[str]
    61        encoding='UTF8',  # type: Optional[str]
    62        from_gcs=False  # type: bool
    63    ):
    64      self.content = content
    65      self.type = type
    66      self.encoding = encoding
    67      self.language_hint = language_hint
    68      self.from_gcs = from_gcs
    69  
    70    @staticmethod
    71    def to_dict(document):
    72      # type: (Document) -> Mapping[str, Optional[str]]
    73      if document.from_gcs:
    74        dict_repr = {'gcs_content_uri': document.content}
    75      else:
    76        dict_repr = {'content': document.content}
    77      dict_repr.update({
    78          'type': document.type, 'language': document.language_hint
    79      })
    80      return dict_repr
    81  
    82  
    83  @beam.ptransform_fn
    84  def AnnotateText(
    85      pcoll,  # type: beam.pvalue.PCollection
    86      features, # type: Union[Mapping[str, bool], language_v1.AnnotateTextRequest.Features]
    87      timeout=None,  # type: Optional[float]
    88      metadata=None  # type: Optional[Sequence[Tuple[str, str]]]
    89  ):
    90    """A :class:`~apache_beam.transforms.ptransform.PTransform`
    91    for annotating text using the Google Cloud Natural Language API:
    92    https://cloud.google.com/natural-language/docs.
    93  
    94    Args:
    95      pcoll (:class:`~apache_beam.pvalue.PCollection`): An input PCollection of
    96        :class:`Document` objects.
    97      features (`Union[Mapping[str, bool], types.AnnotateTextRequest.Features]`):
    98        A dictionary of natural language operations to be performed on given
    99        text in the following format::
   100        {'extact_syntax'=True, 'extract_entities'=True}
   101  
   102      timeout (`Optional[float]`): The amount of time, in seconds, to wait
   103        for the request to complete. The timeout applies to each individual
   104        retry attempt.
   105      metadata (`Optional[Sequence[Tuple[str, str]]]`): Additional metadata
   106        that is provided to the method.
   107    """
   108    return pcoll | beam.ParDo(_AnnotateTextFn(features, timeout, metadata))
   109  
   110  
   111  @beam.typehints.with_input_types(Document)
   112  @beam.typehints.with_output_types(language_v1.AnnotateTextResponse)
   113  class _AnnotateTextFn(beam.DoFn):
   114    def __init__(
   115        self,
   116        features,  # type: Union[Mapping[str, bool], language_v1.AnnotateTextRequest.Features]
   117        timeout,  # type: Optional[float]
   118        metadata=None  # type: Optional[Sequence[Tuple[str, str]]]
   119    ):
   120      self.features = features
   121      self.timeout = timeout
   122      self.metadata = metadata
   123      self.api_calls = Metrics.counter(self.__class__.__name__, 'api_calls')
   124      self.client = None
   125  
   126    def setup(self):
   127      self.client = self._get_api_client()
   128  
   129    @staticmethod
   130    def _get_api_client():
   131      # type: () -> language.LanguageServiceClient
   132      return language.LanguageServiceClient()
   133  
   134    def process(self, element):
   135      response = self.client.annotate_text(
   136          document=Document.to_dict(element),
   137          features=self.features,
   138          encoding_type=element.encoding,
   139          timeout=self.timeout,
   140          metadata=self.metadata)
   141      self.api_calls.inc()
   142      yield response