github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/ml/gcp/cloud_dlp.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """``PTransforms`` that implement Google Cloud Data Loss Prevention
    19  functionality.
    20  """
    21  
    22  import logging
    23  from typing import List
    24  
    25  from google.cloud import dlp_v2
    26  
    27  from apache_beam import typehints
    28  from apache_beam.options.pipeline_options import GoogleCloudOptions
    29  from apache_beam.transforms import DoFn
    30  from apache_beam.transforms import ParDo
    31  from apache_beam.transforms import PTransform
    32  
    33  __all__ = ['MaskDetectedDetails', 'InspectForDetails']
    34  
    35  _LOGGER = logging.getLogger(__name__)
    36  
    37  
    38  @typehints.with_input_types(str)
    39  @typehints.with_output_types(str)
    40  class MaskDetectedDetails(PTransform):
    41    """Scrubs sensitive information detected in text.
    42    The ``PTransform`` returns a ``PCollection`` of ``str``
    43    Example usage::
    44  
    45      pipeline | MaskDetectedDetails(project='example-gcp-project',
    46        deidentification_config={
    47            'info_type_transformations: {
    48                'transformations': [{
    49                    'primitive_transformation': {
    50                        'character_mask_config': {
    51                            'masking_character': '#'
    52                        }
    53                    }
    54                }]
    55            }
    56        }, inspection_config={'info_types': [{'name': 'EMAIL_ADDRESS'}]})
    57  
    58    """
    59    def __init__(
    60        self,
    61        project=None,
    62        deidentification_template_name=None,
    63        deidentification_config=None,
    64        inspection_template_name=None,
    65        inspection_config=None,
    66        timeout=None):
    67      """Initializes a :class:`MaskDetectedDetails` transform.
    68  
    69      Args:
    70        project: Optional. GCP project name in which inspection will be performed
    71        deidentification_template_name (str): Either this or
    72          `deidentification_config` required. Name of
    73          deidentification template to be used on detected sensitive information
    74          instances in text.
    75        deidentification_config
    76          (``Union[dict, google.cloud.dlp_v2.types.DeidentifyConfig]``):
    77          Configuration for the de-identification of the content item.
    78          If both template name and config are supplied,
    79          config is more important.
    80        inspection_template_name (str): This or `inspection_config` required.
    81          Name of inspection template to be used
    82          to detect sensitive data in text.
    83        inspection_config
    84          (``Union[dict, google.cloud.dlp_v2.types.InspectConfig]``):
    85          Configuration for the inspector used to detect sensitive data in text.
    86          If both template name and config are supplied,
    87          config takes precedence.
    88        timeout (float): Optional. The amount of time, in seconds, to wait for
    89          the request to complete.
    90  
    91      """
    92      self.config = {}
    93      self.project = project
    94      self.timeout = timeout
    95      if deidentification_template_name is not None \
    96          and deidentification_config is not None:
    97        raise ValueError(
    98            'Both deidentification_template_name and '
    99            'deidentification_config were specified.'
   100            ' Please specify only one of these.')
   101      elif deidentification_template_name is None \
   102          and deidentification_config is None:
   103        raise ValueError(
   104            'deidentification_template_name or '
   105            'deidentification_config must be specified.')
   106      elif deidentification_template_name is not None:
   107        self.config['deidentify_template_name'] = deidentification_template_name
   108      else:
   109        self.config['deidentify_config'] = deidentification_config
   110  
   111      if inspection_config is None and inspection_template_name is None:
   112        raise ValueError(
   113            'inspection_template_name or inspection_config must be specified')
   114      if inspection_template_name is not None:
   115        self.config['inspect_template_name'] = inspection_template_name
   116      if inspection_config is not None:
   117        self.config['inspect_config'] = inspection_config
   118  
   119    def expand(self, pcoll):
   120      if self.project is None:
   121        self.project = pcoll.pipeline.options.view_as(GoogleCloudOptions).project
   122      if self.project is None:
   123        raise ValueError(
   124            'GCP project name needs to be specified in "project" pipeline option')
   125      return (
   126          pcoll
   127          | ParDo(_DeidentifyFn(self.config, self.timeout, self.project)))
   128  
   129  
   130  @typehints.with_input_types(str)
   131  @typehints.with_output_types(List[dlp_v2.types.dlp.Finding])
   132  class InspectForDetails(PTransform):
   133    """Inspects input text for sensitive information.
   134    the ``PTransform`` returns a ``PCollection`` of
   135    ``List[google.cloud.dlp_v2.proto.dlp_pb2.Finding]``
   136    Example usage::
   137  
   138        pipeline | InspectForDetails(project='example-gcp-project',
   139                  inspection_config={'info_types': [{'name': 'EMAIL_ADDRESS'}]})
   140    """
   141    def __init__(
   142        self,
   143        project=None,
   144        inspection_template_name=None,
   145        inspection_config=None,
   146        timeout=None):
   147      """Initializes a :class:`InspectForDetails` transform.
   148  
   149      Args:
   150        project: Optional. GCP project name in which inspection will be performed
   151        inspection_template_name (str): This or `inspection_config` required.
   152          Name of inspection template to be used
   153          to detect sensitive data in text.
   154        inspection_config
   155          (``Union[dict, google.cloud.dlp_v2.types.InspectConfig]``):
   156          Configuration for the inspector used to detect sensitive data in text.
   157          If both template name and config are supplied,
   158          config takes precedence.
   159        timeout (float): Optional. The amount of time, in seconds, to wait for
   160          the request to complete.
   161  
   162      """
   163      self.timeout = timeout
   164      self.config = {}
   165      self.project = project
   166      if inspection_config is None and inspection_template_name is None:
   167        raise ValueError(
   168            'inspection_template_name or inspection_config must be specified')
   169      if inspection_template_name is not None:
   170        self.config['inspect_template_name'] = inspection_template_name
   171      if inspection_config is not None:
   172        self.config['inspect_config'] = inspection_config
   173  
   174    def expand(self, pcoll):
   175      if self.project is None:
   176        self.project = pcoll.pipeline.options.view_as(GoogleCloudOptions).project
   177      if self.project is None:
   178        raise ValueError(
   179            'GCP project name needs to be specified in "project" pipeline option')
   180      return pcoll | ParDo(_InspectFn(self.config, self.timeout, self.project))
   181  
   182  
   183  class _DeidentifyFn(DoFn):
   184    def __init__(self, config=None, timeout=None, project=None, client=None):
   185      self.config = config
   186      self.timeout = timeout
   187      self.client = client
   188      self.project = project
   189      self.params = {}
   190  
   191    def setup(self):
   192      if self.client is None:
   193        self.client = dlp_v2.DlpServiceClient()
   194      self.params = {
   195          'timeout': self.timeout,
   196      }
   197      self.parent = self.client.common_project_path(self.project)
   198  
   199    def process(self, element, **kwargs):
   200      request = {'item': {'value': element}, 'parent': self.parent}
   201      request.update(self.config)
   202      operation = self.client.deidentify_content(request=request, **self.params)
   203      yield operation.item.value
   204  
   205  
   206  class _InspectFn(DoFn):
   207    def __init__(self, config=None, timeout=None, project=None):
   208      self.config = config
   209      self.timeout = timeout
   210      self.client = None
   211      self.project = project
   212      self.params = {}
   213  
   214    def setup(self):
   215      if self.client is None:
   216        self.client = dlp_v2.DlpServiceClient()
   217      self.params = {
   218          'timeout': self.timeout,
   219      }
   220      self.parent = self.client.common_project_path(self.project)
   221  
   222    def process(self, element, **kwargs):
   223      request = {'item': {'value': element}, 'parent': self.parent}
   224      request.update(self.config)
   225      operation = self.client.inspect_content(request=request, **self.params)
   226      hits = [x for x in operation.result.findings]
   227      yield hits