#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""``PTransforms`` that implement Google Cloud Data Loss Prevention
functionality.
"""

import logging
from typing import List

from google.cloud import dlp_v2

from apache_beam import typehints
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.transforms import DoFn
from apache_beam.transforms import ParDo
from apache_beam.transforms import PTransform

__all__ = ['MaskDetectedDetails', 'InspectForDetails']

_LOGGER = logging.getLogger(__name__)


def _inspection_settings(inspection_template_name, inspection_config):
  """Validates the inspection arguments and returns them as request fields.

  Args:
    inspection_template_name (str): Name of an inspection template, or None.
    inspection_config: Inspection configuration, or None.

  Returns:
    dict: Contains ``'inspect_template_name'`` and/or ``'inspect_config'``,
    one entry for each argument that is not None.

  Raises:
    ValueError: If both arguments are None.
  """
  if inspection_config is None and inspection_template_name is None:
    raise ValueError(
        'inspection_template_name or inspection_config must be specified')
  settings = {}
  if inspection_template_name is not None:
    settings['inspect_template_name'] = inspection_template_name
  if inspection_config is not None:
    settings['inspect_config'] = inspection_config
  return settings


def _resolve_project(project, pcoll):
  """Returns the GCP project to use for requests made on behalf of ``pcoll``.

  Args:
    project: Project given to the transform explicitly, or None.
    pcoll: The input ``PCollection``; its pipeline options supply the
      fallback ``project`` value.

  Returns:
    The explicit ``project`` if set, otherwise the ``project`` value of the
    pipeline's ``GoogleCloudOptions``.

  Raises:
    ValueError: If neither source provides a project.
  """
  if project is None:
    project = pcoll.pipeline.options.view_as(GoogleCloudOptions).project
  if project is None:
    raise ValueError(
        'GCP project name needs to be specified in "project" pipeline option')
  return project


@typehints.with_input_types(str)
@typehints.with_output_types(str)
class MaskDetectedDetails(PTransform):
  """Scrubs sensitive information detected in text.

  The ``PTransform`` returns a ``PCollection`` of ``str``.

  Example usage::

    pipeline | MaskDetectedDetails(project='example-gcp-project',
      deidentification_config={
          'info_type_transformations': {
              'transformations': [{
                  'primitive_transformation': {
                      'character_mask_config': {
                          'masking_character': '#'
                      }
                  }
              }]
          }
      }, inspection_config={'info_types': [{'name': 'EMAIL_ADDRESS'}]})

  """
  def __init__(
      self,
      project=None,
      deidentification_template_name=None,
      deidentification_config=None,
      inspection_template_name=None,
      inspection_config=None,
      timeout=None):
    """Initializes a :class:`MaskDetectedDetails` transform.

    Args:
      project: Optional. GCP project name in which inspection will be
        performed. When omitted, the ``project`` pipeline option is used.
      deidentification_template_name (str): Exactly one of this and
        `deidentification_config` is required. Name of the
        deidentification template to be used on detected sensitive
        information instances in text.
      deidentification_config
        (``Union[dict, google.cloud.dlp_v2.types.DeidentifyConfig]``):
        Configuration for the de-identification of the content item.
        Mutually exclusive with `deidentification_template_name`.
      inspection_template_name (str): This or `inspection_config` required.
        Name of inspection template to be used
        to detect sensitive data in text.
      inspection_config
        (``Union[dict, google.cloud.dlp_v2.types.InspectConfig]``):
        Configuration for the inspector used to detect sensitive data in text.
        If both template name and config are supplied, both are sent with
        the request and config takes precedence.
      timeout (float): Optional. The amount of time, in seconds, to wait for
        the request to complete.

    Raises:
      ValueError: If both or neither of `deidentification_template_name` and
        `deidentification_config` are given, or if neither
        `inspection_template_name` nor `inspection_config` is given.
    """
    self.project = project
    self.timeout = timeout

    has_template = deidentification_template_name is not None
    has_config = deidentification_config is not None
    if has_template and has_config:
      raise ValueError(
          'Both deidentification_template_name and '
          'deidentification_config were specified.'
          ' Please specify only one of these.')
    if not has_template and not has_config:
      raise ValueError(
          'deidentification_template_name or '
          'deidentification_config must be specified.')

    if has_template:
      self.config = {
          'deidentify_template_name': deidentification_template_name
      }
    else:
      self.config = {'deidentify_config': deidentification_config}
    # The de-identification arguments are validated first so their errors
    # are reported before any inspection-argument error.
    self.config.update(
        _inspection_settings(inspection_template_name, inspection_config))

  def expand(self, pcoll):
    """Applies the de-identification ``DoFn`` to every element of ``pcoll``.

    Raises:
      ValueError: If no project was given to the transform and the pipeline
        has no ``project`` option.
    """
    # Resolve into a local so that a project taken from one pipeline's
    # options is not remembered if this transform is applied again.
    project = _resolve_project(self.project, pcoll)
    return pcoll | ParDo(_DeidentifyFn(self.config, self.timeout, project))


@typehints.with_input_types(str)
@typehints.with_output_types(List[dlp_v2.types.dlp.Finding])
class InspectForDetails(PTransform):
  """Inspects input text for sensitive information.

  The ``PTransform`` returns a ``PCollection`` of
  ``List[google.cloud.dlp_v2.types.dlp.Finding]``.

  Example usage::

    pipeline | InspectForDetails(project='example-gcp-project',
      inspection_config={'info_types': [{'name': 'EMAIL_ADDRESS'}]})
  """
  def __init__(
      self,
      project=None,
      inspection_template_name=None,
      inspection_config=None,
      timeout=None):
    """Initializes a :class:`InspectForDetails` transform.

    Args:
      project: Optional. GCP project name in which inspection will be
        performed. When omitted, the ``project`` pipeline option is used.
      inspection_template_name (str): This or `inspection_config` required.
        Name of inspection template to be used
        to detect sensitive data in text.
      inspection_config
        (``Union[dict, google.cloud.dlp_v2.types.InspectConfig]``):
        Configuration for the inspector used to detect sensitive data in text.
        If both template name and config are supplied, both are sent with
        the request and config takes precedence.
      timeout (float): Optional. The amount of time, in seconds, to wait for
        the request to complete.

    Raises:
      ValueError: If neither `inspection_template_name` nor
        `inspection_config` is given.
    """
    self.timeout = timeout
    self.project = project
    self.config = _inspection_settings(
        inspection_template_name, inspection_config)

  def expand(self, pcoll):
    """Applies the inspection ``DoFn`` to every element of ``pcoll``.

    Raises:
      ValueError: If no project was given to the transform and the pipeline
        has no ``project`` option.
    """
    # Resolve into a local so that a project taken from one pipeline's
    # options is not remembered if this transform is applied again.
    project = _resolve_project(self.project, pcoll)
    return pcoll | ParDo(_InspectFn(self.config, self.timeout, project))


class _DeidentifyFn(DoFn):
  """``DoFn`` that passes each element through DLP ``deidentify_content``."""
  def __init__(self, config=None, timeout=None, project=None, client=None):
    """Stores the request settings.

    Args:
      config (dict): Extra request fields merged into every request. None is
        treated as an empty dict.
      timeout (float): Forwarded to the client call as ``timeout``.
      project: Project handed to ``client.common_project_path`` in
        :meth:`setup`.
      client: Optional pre-built client; when None, a
        ``dlp_v2.DlpServiceClient`` is created in :meth:`setup`.
    """
    # ``process`` merges the config with ``dict.update``, which rejects None.
    self.config = {} if config is None else config
    self.timeout = timeout
    self.client = client
    self.project = project
    self.params = {}
    self.parent = None

  def setup(self):
    """Creates the client if needed and prepares the per-call arguments."""
    if self.client is None:
      self.client = dlp_v2.DlpServiceClient()
    # NOTE(review): ``timeout`` is forwarded even when it is None; confirm
    # that this is how the client's default timeout should be requested.
    self.params = {'timeout': self.timeout}
    self.parent = self.client.common_project_path(self.project)

  def process(self, element, **kwargs):
    """Yields ``item.value`` of the de-identify response for ``element``."""
    request = {'item': {'value': element}, 'parent': self.parent}
    request.update(self.config)
    response = self.client.deidentify_content(request=request, **self.params)
    yield response.item.value


class _InspectFn(DoFn):
  """``DoFn`` that passes each element through DLP ``inspect_content``."""
  def __init__(self, config=None, timeout=None, project=None, client=None):
    """Stores the request settings.

    Args:
      config (dict): Extra request fields merged into every request. None is
        treated as an empty dict.
      timeout (float): Forwarded to the client call as ``timeout``.
      project: Project handed to ``client.common_project_path`` in
        :meth:`setup`.
      client: Optional pre-built client, accepted for parity with
        :class:`_DeidentifyFn`; when None, a ``dlp_v2.DlpServiceClient`` is
        created in :meth:`setup`.
    """
    # ``process`` merges the config with ``dict.update``, which rejects None.
    self.config = {} if config is None else config
    self.timeout = timeout
    self.client = client
    self.project = project
    self.params = {}
    self.parent = None

  def setup(self):
    """Creates the client if needed and prepares the per-call arguments."""
    if self.client is None:
      self.client = dlp_v2.DlpServiceClient()
    # NOTE(review): ``timeout`` is forwarded even when it is None; confirm
    # that this is how the client's default timeout should be requested.
    self.params = {'timeout': self.timeout}
    self.parent = self.client.common_project_path(self.project)

  def process(self, element, **kwargs):
    """Yields the list of findings in the inspect response for ``element``."""
    request = {'item': {'value': element}, 'parent': self.parent}
    request.update(self.config)
    response = self.client.inspect_content(request=request, **self.params)
    yield list(response.result.findings)