github.com/aldelo/common@v1.5.1/wrapper/textract/textract.go (about) 1 package textract 2 3 /* 4 * Copyright 2020-2024 Aldelo, LP 5 * 6 * Licensed under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 // ================================================================================================================= 20 // AWS CREDENTIAL: 21 // use $> aws configure (to set aws access key and secret to target machine) 22 // Store AWS Access ID and Secret Key into Default Profile Using '$ aws configure' cli 23 // 24 // To Install & Setup AWS CLI on Host: 25 // 1) https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2-linux.html 26 // On Ubuntu, if host does not have zip and unzip: 27 // $> sudo apt install zip 28 // $> sudo apt install unzip 29 // On Ubuntu, to install AWS CLI v2: 30 // $> curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 31 // $> unzip awscliv2.zip 32 // $> sudo ./aws/install 33 // 2) $> aws configure set region awsRegionName --profile default 34 // 3) $> aws configure 35 // follow prompts to enter Access ID and Secret Key 36 // 37 // AWS Region Name Reference: 38 // us-west-2, us-east-1, ap-northeast-1, etc 39 // See: https://docs.aws.amazon.com/general/latest/gr/rande.html 40 // ================================================================================================================= 41 42 import ( 43 "context" 44 "errors" 45 "net/http" 46 47 awshttp2 "github.com/aldelo/common/wrapper/aws" 48 "github.com/aldelo/common/wrapper/aws/awsregion" 49 "github.com/aldelo/common/wrapper/xray" 50 "github.com/aws/aws-sdk-go-v2/config" 51 "github.com/aws/aws-sdk-go-v2/service/textract" 52 "github.com/aws/aws-sdk-go-v2/service/textract/types" 53 ) 54 55 // ================================================================================================================ 56 // STRUCTS 57 // ================================================================================================================ 58 59 // Textract struct encapsulates the AWS Textract access functionality 60 type Textract struct { 61 // define the AWS region that Textract is located at 62 AwsRegion awsregion.AWSRegion 63 64 // custom http2 client options 65 HttpOptions *awshttp2.HttpClientSettings 66 67 // store Textract client object 68 textractClient *textract.Client 69 70 _parentSegment *xray.XRayParentSegment 71 } 72 73 // ================================================================================================================ 74 // STRUCTS FUNCTIONS 75 // ================================================================================================================ 76 77 // ---------------------------------------------------------------------------------------------------------------- 78 // utility functions 79 // ---------------------------------------------------------------------------------------------------------------- 80 81 // Connect will establish a connection to the Textract service 82 func (s *Textract) Connect(parentSegment ...*xray.XRayParentSegment) (err error) { 83 if xray.XRayServiceOn() { 84 if len(parentSegment) > 0 { 85 s._parentSegment = parentSegment[0] 86 } 87 88 seg := xray.NewSegment("Textract-Connect", s._parentSegment) 89 defer seg.Close() 90 defer func() { 91 _ = seg.Seg.AddMetadata("Textract-AWS-Region", s.AwsRegion) 92 93 if err != nil { 94 _ = seg.Seg.AddError(err) 95 } 96 }() 97 98 err = s.connectInternal(seg.Ctx) 99 100 return err 101 } else { 102 return s.connectInternal(context.Background()) 103 } 104 } 105 106 // Connect will establish a connection to the Textract service 107 func (s *Textract) connectInternal(ctx context.Context) error { 108 // clean up prior textract client reference 109 s.textractClient = nil 110 111 if !s.AwsRegion.Valid() || s.AwsRegion == awsregion.UNKNOWN { 112 return errors.New("Connect to Textract Failed: (AWS Session Error) " + "Region is Required") 113 } 114 115 // create custom http2 client if needed 116 var httpCli *http.Client 117 var httpErr error 118 119 if s.HttpOptions == nil { 120 s.HttpOptions = new(awshttp2.HttpClientSettings) 121 } 122 123 // use custom http2 client 124 h2 := &awshttp2.AwsHttp2Client{ 125 Options: s.HttpOptions, 126 } 127 128 if httpCli, httpErr = h2.NewHttp2Client(); httpErr != nil { 129 return errors.New("Connect to Textract Failed: (AWS Session Error) " + "Create Custom http2 Client Errored = " + httpErr.Error()) 130 } 131 132 // establish aws session connection 133 if cfg, err := config.LoadDefaultConfig(ctx, config.WithHTTPClient(httpCli)); err != nil { 134 // aws session error 135 return errors.New("Connect to Textract Failed: (AWS Session Error) " + err.Error()) 136 } else { 137 // create cached objects for shared use 138 s.textractClient = textract.NewFromConfig(cfg) 139 140 if s.textractClient == nil { 141 return errors.New("Connect to Textract Client Failed: (New Textract Client Connection) " + "Connection Object Nil") 142 } 143 144 // connect successful 145 return nil 146 } 147 } 148 149 // Disconnect will clear textract client 150 func (s *Textract) Disconnect() { 151 s.textractClient = nil 152 } 153 154 // UpdateParentSegment updates this struct's xray parent segment, if no parent segment, set nil 155 func (s *Textract) UpdateParentSegment(parentSegment *xray.XRayParentSegment) { 156 s._parentSegment = parentSegment 157 } 158 159 // ---------------------------------------------------------------------------------------------------------------- 160 // Analysis functions 161 // ---------------------------------------------------------------------------------------------------------------- 162 163 // Analyzes identity documents for relevant information. This information is 164 // extracted and returned as IdentityDocumentFields , which records both the 165 // normalized field and value of the extracted text. Unlike other Amazon Textract 166 // operations, AnalyzeID doesn't return any Geometry data. 167 func (s *Textract) AnalyzeID(data []byte) (doc *types.IdentityDocument, err error) { 168 segCtx := context.Background() 169 segCtxSet := false 170 171 seg := xray.NewSegmentNullable("Textract-AnalyzeID", s._parentSegment) 172 173 if seg != nil { 174 segCtx = seg.Ctx 175 segCtxSet = true 176 177 defer seg.Close() 178 defer func() { 179 _ = seg.Seg.AddMetadata("Textract-AnalyzeID-IdentityFields", doc) 180 181 if err != nil { 182 _ = seg.Seg.AddError(err) 183 } 184 }() 185 } 186 187 // validation 188 if s.textractClient == nil { 189 err = errors.New("AnalyzeID Failed: " + "Textract Client is Required") 190 return nil, err 191 } 192 193 if len(data) <= 0 { 194 err = errors.New("AnalyzeID Failed: " + "Document is Required") 195 return nil, err 196 } 197 198 // create input object 199 input := &textract.AnalyzeIDInput{ 200 DocumentPages: []types.Document{ 201 { 202 Bytes: data, 203 }, 204 }, 205 } 206 207 // perform action 208 var output *textract.AnalyzeIDOutput 209 210 if segCtxSet { 211 output, err = s.textractClient.AnalyzeID(segCtx, input) 212 } else { 213 output, err = s.textractClient.AnalyzeID(context.Background(), input) 214 } 215 216 // evaluate result 217 if err != nil { 218 return nil, err 219 } 220 if len(output.IdentityDocuments) == 0 { 221 return nil, errors.New("AnalyzeID Failed: " + "No Identity Documents Found") 222 } 223 224 return &output.IdentityDocuments[0], nil 225 } 226 227 // Detects text in the input document. Amazon Textract can detect lines of text 228 // and the words that make up a line of text. The input document must be in one of 229 // the following image formats: JPEG, PNG, PDF, or TIFF. DetectDocumentText 230 // returns the detected text in an array of Block objects. Each document page has 231 // as an associated Block of type PAGE. Each PAGE Block object is the parent of 232 // LINE Block objects that represent the lines of detected text on a page. A LINE 233 // Block object is a parent for each word that makes up the line. Words are 234 // represented by Block objects of type WORD. DetectDocumentText is a synchronous 235 // operation. To analyze documents asynchronously, use StartDocumentTextDetection . 236 // For more information, see Document Text Detection (https://docs.aws.amazon.com/textract/latest/dg/how-it-works-detecting.html) 237 // . 238 func (s *Textract) DetectDocumentText(data []byte) (blocks []types.Block, err error) { 239 segCtx := context.Background() 240 segCtxSet := false 241 242 seg := xray.NewSegmentNullable("Textract-DetectDocumentText", s._parentSegment) 243 244 if seg != nil { 245 segCtx = seg.Ctx 246 segCtxSet = true 247 248 defer seg.Close() 249 defer func() { 250 _ = seg.Seg.AddMetadata("Textract-DetectDocumentText-DetectedBlocks", blocks) 251 252 if err != nil { 253 _ = seg.Seg.AddError(err) 254 } 255 }() 256 } 257 258 // validation 259 if s.textractClient == nil { 260 err = errors.New("DetectDocumentText Failed: " + "Textract Client is Required") 261 return nil, err 262 } 263 264 if len(data) <= 0 { 265 err = errors.New("DetectDocumentText Failed: " + "Document is Required") 266 return nil, err 267 } 268 269 // create input object 270 input := &textract.DetectDocumentTextInput{ 271 Document: &types.Document{ 272 Bytes: data, 273 }, 274 } 275 276 // perform action 277 var output *textract.DetectDocumentTextOutput 278 279 if segCtxSet { 280 output, err = s.textractClient.DetectDocumentText(segCtx, input) 281 } else { 282 output, err = s.textractClient.DetectDocumentText(context.Background(), input) 283 } 284 285 // evaluate result 286 if err != nil { 287 return nil, err 288 } 289 if len(output.Blocks) == 0 { 290 return nil, errors.New("DetectDocumentText Failed: " + "No Blocks Detected") 291 } 292 293 return output.Blocks, nil 294 }