# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/ml/gcp/naturallanguageml.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Transforms for annotating text with the Google Cloud Natural Language API.

The public entry points are :class:`Document`, which describes one piece of
text to analyse, and :func:`AnnotateText`, which sends every
:class:`Document` of a PCollection to the API.
"""

from typing import Mapping
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union

import apache_beam as beam
from apache_beam.metrics import Metrics

try:
  from google.cloud import language
  from google.cloud import language_v1
except ImportError:
  raise ImportError(
      'Google Cloud Natural Language API not supported for this execution '
      'environment (could not import Natural Language API client).')

__all__ = ['Document', 'AnnotateText']


class Document(object):
  """Represents the input to :class:`AnnotateText` transform.

  Args:
    content (str): The content of the input or the Google Cloud Storage URI
      where the file is stored.
    type (`Union[str, google.cloud.language_v1.Document.Type]`): Text type.
      Possible values are `HTML`, `PLAIN_TEXT`. The default value is
      `PLAIN_TEXT`.
    language_hint (`Optional[str]`): The language of the text. If not specified,
      language will be automatically detected. Values should conform to
      ISO-639-1 standard.
    encoding (`Optional[str]`): Text encoding. Possible values are: `NONE`,
      `UTF8`, `UTF16`, `UTF32`. The default value is `UTF8`.
    from_gcs (bool): Whether the content should be interpreted as a Google
      Cloud Storage URI. The default value is :data:`False`.
  """

  def __init__(
      self,
      content,  # type: str
      type='PLAIN_TEXT',  # type: Union[str, language_v1.Document.Type]
      language_hint=None,  # type: Optional[str]
      encoding='UTF8',  # type: Optional[str]
      from_gcs=False  # type: bool
  ):
    self.content = content
    self.type = type
    self.encoding = encoding
    self.language_hint = language_hint
    self.from_gcs = from_gcs

  @staticmethod
  def to_dict(document):
    # type: (Document) -> Mapping[str, Optional[str]]

    """Returns the dictionary form of ``document`` used as the API payload.

    The content is stored under ``'gcs_content_uri'`` when
    ``document.from_gcs`` is true and under ``'content'`` otherwise. The
    ``'type'`` and ``'language'`` keys are always present; ``'language'`` is
    :data:`None` when no language hint was given.

    ``document.encoding`` is intentionally not part of the result: it is sent
    separately as the ``encoding_type`` argument of the request.

    Args:
      document (Document): The document to convert.

    Returns:
      A new dictionary with the keys described above.
    """
    if document.from_gcs:
      dict_repr = {'gcs_content_uri': document.content}
    else:
      dict_repr = {'content': document.content}
    dict_repr.update({
        'type': document.type, 'language': document.language_hint
    })
    return dict_repr


@beam.ptransform_fn
def AnnotateText(
    pcoll,  # type: beam.pvalue.PCollection
    features,  # type: Union[Mapping[str, bool], language_v1.AnnotateTextRequest.Features]
    timeout=None,  # type: Optional[float]
    metadata=None  # type: Optional[Sequence[Tuple[str, str]]]
):
  """A :class:`~apache_beam.transforms.ptransform.PTransform`
  for annotating text using the Google Cloud Natural Language API:
  https://cloud.google.com/natural-language/docs.

  Args:
    pcoll (:class:`~apache_beam.pvalue.PCollection`): An input PCollection of
      :class:`Document` objects.
    features (`Union[Mapping[str, bool], language_v1.AnnotateTextRequest.Features]`):
      A dictionary of natural language operations to be performed on given
      text in the following format::

        {'extract_syntax': True, 'extract_entities': True}

    timeout (`Optional[float]`): The amount of time, in seconds, to wait
      for the request to complete. The timeout applies to each individual
      retry attempt.
    metadata (`Optional[Sequence[Tuple[str, str]]]`): Additional metadata
      that is provided to the method.

  Returns:
    The PCollection produced by applying the annotating
    :class:`~apache_beam.DoFn` to ``pcoll``; one API response is emitted per
    input :class:`Document`.
  """
  return pcoll | beam.ParDo(_AnnotateTextFn(features, timeout, metadata))


@beam.typehints.with_input_types(Document)
@beam.typehints.with_output_types(language_v1.AnnotateTextResponse)
class _AnnotateTextFn(beam.DoFn):
  """A DoFn that calls ``annotate_text`` once for every input Document.

  The API client is not created in the constructor; it is created in
  :meth:`setup`. Every call that returns is counted in the ``api_calls``
  metric.
  """
  def __init__(
      self,
      features,  # type: Union[Mapping[str, bool], language_v1.AnnotateTextRequest.Features]
      timeout,  # type: Optional[float]
      metadata=None  # type: Optional[Sequence[Tuple[str, str]]]
  ):
    """Stores the request options that are shared by all API calls.

    Args:
      features: The operations to request, forwarded unchanged to the API.
      timeout: Per-request timeout in seconds, forwarded unchanged.
      metadata: Additional request metadata, forwarded unchanged.
    """
    self.features = features
    self.timeout = timeout
    self.metadata = metadata
    self.api_calls = Metrics.counter(self.__class__.__name__, 'api_calls')
    # Populated by setup(); stays None until then.
    self.client = None

  def setup(self):
    """Creates the API client used by :meth:`process`."""
    self.client = self._get_api_client()

  @staticmethod
  def _get_api_client():
    # type: () -> language.LanguageServiceClient

    """Returns a new client constructed with default arguments."""
    return language.LanguageServiceClient()

  def process(self, element):
    """Annotates one Document and yields the API response.

    Args:
      element (Document): The document to annotate.

    Yields:
      The response returned by ``annotate_text`` for ``element``.
    """
    response = self.client.annotate_text(
        document=Document.to_dict(element),
        features=self.features,
        encoding_type=element.encoding,
        timeout=self.timeout,
        metadata=self.metadata)
    # Incremented only after the call returns, so a call that raises is not
    # counted.
    self.api_calls.inc()
    yield response