github.com/aldelo/common@v1.5.1/wrapper/textract/textract.go (about)

     1  package textract
     2  
     3  /*
     4   * Copyright 2020-2024 Aldelo, LP
     5   *
     6   * Licensed under the Apache License, Version 2.0 (the "License");
     7   * you may not use this file except in compliance with the License.
     8   * You may obtain a copy of the License at
     9   *
    10   *     http://www.apache.org/licenses/LICENSE-2.0
    11   *
    12   * Unless required by applicable law or agreed to in writing, software
    13   * distributed under the License is distributed on an "AS IS" BASIS,
    14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15   * See the License for the specific language governing permissions and
    16   * limitations under the License.
    17   */
    18  
    19  // =================================================================================================================
    20  // AWS CREDENTIAL:
    21  //		use $> aws configure (to set aws access key and secret to target machine)
    22  //		Store AWS Access ID and Secret Key into Default Profile Using '$ aws configure' cli
    23  //
    24  // To Install & Setup AWS CLI on Host:
    25  //		1) https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2-linux.html
    26  //				On Ubuntu, if host does not have zip and unzip:
    27  //					$> sudo apt install zip
    28  //					$> sudo apt install unzip
    29  //				On Ubuntu, to install AWS CLI v2:
    30  //					$> curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
    31  //					$> unzip awscliv2.zip
    32  //					$> sudo ./aws/install
    33  //		2) $> aws configure set region awsRegionName --profile default
    34  // 		3) $> aws configure
    35  //				follow prompts to enter Access ID and Secret Key
    36  //
    37  // AWS Region Name Reference:
    38  //		us-west-2, us-east-1, ap-northeast-1, etc
    39  //		See: https://docs.aws.amazon.com/general/latest/gr/rande.html
    40  // =================================================================================================================
    41  
    42  import (
    43  	"context"
    44  	"errors"
    45  	"net/http"
    46  
    47  	awshttp2 "github.com/aldelo/common/wrapper/aws"
    48  	"github.com/aldelo/common/wrapper/aws/awsregion"
    49  	"github.com/aldelo/common/wrapper/xray"
    50  	"github.com/aws/aws-sdk-go-v2/config"
    51  	"github.com/aws/aws-sdk-go-v2/service/textract"
    52  	"github.com/aws/aws-sdk-go-v2/service/textract/types"
    53  )
    54  
    55  // ================================================================================================================
    56  // STRUCTS
    57  // ================================================================================================================
    58  
// Textract struct encapsulates the AWS Textract access functionality.
//
// Set AwsRegion (and optionally HttpOptions) before calling Connect; the
// unexported fields are maintained by the methods of this struct.
type Textract struct {
	// define the AWS region that Textract is located at;
	// connecting fails when this value is not Valid() or equals awsregion.UNKNOWN
	AwsRegion awsregion.AWSRegion

	// custom http2 client options;
	// when nil at connect time, an empty HttpClientSettings is allocated and stored here
	HttpOptions *awshttp2.HttpClientSettings

	// store Textract client object;
	// reset to nil at the start of every connect attempt and by Disconnect,
	// the analysis functions return an error while this is nil
	textractClient *textract.Client

	// xray parent segment passed to xray when the methods of this struct create their segments;
	// assigned by Connect (only when xray is on and a parent segment is supplied) and by UpdateParentSegment
	_parentSegment *xray.XRayParentSegment
}
    72  
    73  // ================================================================================================================
    74  // STRUCTS FUNCTIONS
    75  // ================================================================================================================
    76  
    77  // ----------------------------------------------------------------------------------------------------------------
    78  // utility functions
    79  // ----------------------------------------------------------------------------------------------------------------
    80  
    81  // Connect will establish a connection to the Textract service
    82  func (s *Textract) Connect(parentSegment ...*xray.XRayParentSegment) (err error) {
    83  	if xray.XRayServiceOn() {
    84  		if len(parentSegment) > 0 {
    85  			s._parentSegment = parentSegment[0]
    86  		}
    87  
    88  		seg := xray.NewSegment("Textract-Connect", s._parentSegment)
    89  		defer seg.Close()
    90  		defer func() {
    91  			_ = seg.Seg.AddMetadata("Textract-AWS-Region", s.AwsRegion)
    92  
    93  			if err != nil {
    94  				_ = seg.Seg.AddError(err)
    95  			}
    96  		}()
    97  
    98  		err = s.connectInternal(seg.Ctx)
    99  
   100  		return err
   101  	} else {
   102  		return s.connectInternal(context.Background())
   103  	}
   104  }
   105  
   106  // Connect will establish a connection to the Textract service
   107  func (s *Textract) connectInternal(ctx context.Context) error {
   108  	// clean up prior textract client reference
   109  	s.textractClient = nil
   110  
   111  	if !s.AwsRegion.Valid() || s.AwsRegion == awsregion.UNKNOWN {
   112  		return errors.New("Connect to Textract Failed: (AWS Session Error) " + "Region is Required")
   113  	}
   114  
   115  	// create custom http2 client if needed
   116  	var httpCli *http.Client
   117  	var httpErr error
   118  
   119  	if s.HttpOptions == nil {
   120  		s.HttpOptions = new(awshttp2.HttpClientSettings)
   121  	}
   122  
   123  	// use custom http2 client
   124  	h2 := &awshttp2.AwsHttp2Client{
   125  		Options: s.HttpOptions,
   126  	}
   127  
   128  	if httpCli, httpErr = h2.NewHttp2Client(); httpErr != nil {
   129  		return errors.New("Connect to Textract Failed: (AWS Session Error) " + "Create Custom http2 Client Errored = " + httpErr.Error())
   130  	}
   131  
   132  	// establish aws session connection
   133  	if cfg, err := config.LoadDefaultConfig(ctx, config.WithHTTPClient(httpCli)); err != nil {
   134  		// aws session error
   135  		return errors.New("Connect to Textract Failed: (AWS Session Error) " + err.Error())
   136  	} else {
   137  		// create cached objects for shared use
   138  		s.textractClient = textract.NewFromConfig(cfg)
   139  
   140  		if s.textractClient == nil {
   141  			return errors.New("Connect to Textract Client Failed: (New Textract Client Connection) " + "Connection Object Nil")
   142  		}
   143  
   144  		// connect successful
   145  		return nil
   146  	}
   147  }
   148  
   149  // Disconnect will clear textract client
   150  func (s *Textract) Disconnect() {
   151  	s.textractClient = nil
   152  }
   153  
   154  // UpdateParentSegment updates this struct's xray parent segment, if no parent segment, set nil
   155  func (s *Textract) UpdateParentSegment(parentSegment *xray.XRayParentSegment) {
   156  	s._parentSegment = parentSegment
   157  }
   158  
   159  // ----------------------------------------------------------------------------------------------------------------
   160  // Analysis functions
   161  // ----------------------------------------------------------------------------------------------------------------
   162  
   163  // Analyzes identity documents for relevant information. This information is
   164  // extracted and returned as IdentityDocumentFields , which records both the
   165  // normalized field and value of the extracted text. Unlike other Amazon Textract
   166  // operations, AnalyzeID doesn't return any Geometry data.
   167  func (s *Textract) AnalyzeID(data []byte) (doc *types.IdentityDocument, err error) {
   168  	segCtx := context.Background()
   169  	segCtxSet := false
   170  
   171  	seg := xray.NewSegmentNullable("Textract-AnalyzeID", s._parentSegment)
   172  
   173  	if seg != nil {
   174  		segCtx = seg.Ctx
   175  		segCtxSet = true
   176  
   177  		defer seg.Close()
   178  		defer func() {
   179  			_ = seg.Seg.AddMetadata("Textract-AnalyzeID-IdentityFields", doc)
   180  
   181  			if err != nil {
   182  				_ = seg.Seg.AddError(err)
   183  			}
   184  		}()
   185  	}
   186  
   187  	// validation
   188  	if s.textractClient == nil {
   189  		err = errors.New("AnalyzeID Failed: " + "Textract Client is Required")
   190  		return nil, err
   191  	}
   192  
   193  	if len(data) <= 0 {
   194  		err = errors.New("AnalyzeID Failed: " + "Document is Required")
   195  		return nil, err
   196  	}
   197  
   198  	// create input object
   199  	input := &textract.AnalyzeIDInput{
   200  		DocumentPages: []types.Document{
   201  			{
   202  				Bytes: data,
   203  			},
   204  		},
   205  	}
   206  
   207  	// perform action
   208  	var output *textract.AnalyzeIDOutput
   209  
   210  	if segCtxSet {
   211  		output, err = s.textractClient.AnalyzeID(segCtx, input)
   212  	} else {
   213  		output, err = s.textractClient.AnalyzeID(context.Background(), input)
   214  	}
   215  
   216  	// evaluate result
   217  	if err != nil {
   218  		return nil, err
   219  	}
   220  	if len(output.IdentityDocuments) == 0 {
   221  		return nil, errors.New("AnalyzeID Failed: " + "No Identity Documents Found")
   222  	}
   223  
   224  	return &output.IdentityDocuments[0], nil
   225  }
   226  
   227  // Detects text in the input document. Amazon Textract can detect lines of text
   228  // and the words that make up a line of text. The input document must be in one of
   229  // the following image formats: JPEG, PNG, PDF, or TIFF. DetectDocumentText
   230  // returns the detected text in an array of Block objects. Each document page has
   231  // as an associated Block of type PAGE. Each PAGE Block object is the parent of
   232  // LINE Block objects that represent the lines of detected text on a page. A LINE
   233  // Block object is a parent for each word that makes up the line. Words are
   234  // represented by Block objects of type WORD. DetectDocumentText is a synchronous
   235  // operation. To analyze documents asynchronously, use StartDocumentTextDetection .
   236  // For more information, see Document Text Detection (https://docs.aws.amazon.com/textract/latest/dg/how-it-works-detecting.html)
   237  // .
   238  func (s *Textract) DetectDocumentText(data []byte) (blocks []types.Block, err error) {
   239  	segCtx := context.Background()
   240  	segCtxSet := false
   241  
   242  	seg := xray.NewSegmentNullable("Textract-DetectDocumentText", s._parentSegment)
   243  
   244  	if seg != nil {
   245  		segCtx = seg.Ctx
   246  		segCtxSet = true
   247  
   248  		defer seg.Close()
   249  		defer func() {
   250  			_ = seg.Seg.AddMetadata("Textract-DetectDocumentText-DetectedBlocks", blocks)
   251  
   252  			if err != nil {
   253  				_ = seg.Seg.AddError(err)
   254  			}
   255  		}()
   256  	}
   257  
   258  	// validation
   259  	if s.textractClient == nil {
   260  		err = errors.New("DetectDocumentText Failed: " + "Textract Client is Required")
   261  		return nil, err
   262  	}
   263  
   264  	if len(data) <= 0 {
   265  		err = errors.New("DetectDocumentText Failed: " + "Document is Required")
   266  		return nil, err
   267  	}
   268  
   269  	// create input object
   270  	input := &textract.DetectDocumentTextInput{
   271  		Document: &types.Document{
   272  			Bytes: data,
   273  		},
   274  	}
   275  
   276  	// perform action
   277  	var output *textract.DetectDocumentTextOutput
   278  
   279  	if segCtxSet {
   280  		output, err = s.textractClient.DetectDocumentText(segCtx, input)
   281  	} else {
   282  		output, err = s.textractClient.DetectDocumentText(context.Background(), input)
   283  	}
   284  
   285  	// evaluate result
   286  	if err != nil {
   287  		return nil, err
   288  	}
   289  	if len(output.Blocks) == 0 {
   290  		return nil, errors.New("DetectDocumentText Failed: " + "No Blocks Detected")
   291  	}
   292  
   293  	return output.Blocks, nil
   294  }