github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/extractor/text.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  package extractor
     7  
     8  import (
     9  	"bytes"
    10  	"errors"
    11  	"fmt"
    12  
    13  	"github.com/unidoc/unidoc/common"
    14  	"github.com/unidoc/unidoc/pdf/contentstream"
    15  	"github.com/unidoc/unidoc/pdf/core"
    16  	"github.com/unidoc/unidoc/pdf/internal/cmap"
    17  	"github.com/unidoc/unidoc/pdf/model"
    18  )
    19  
    20  // ExtractText processes and extracts all text data in content streams and returns as a string. Takes into
    21  // account character encoding via CMaps in the PDF file.
    22  // The text is processed linearly e.g. in the order in which it appears. A best effort is done to add
    23  // spaces and newlines.
    24  func (e *Extractor) ExtractText() (string, error) {
    25  	var buf bytes.Buffer
    26  
    27  	cstreamParser := contentstream.NewContentStreamParser(e.contents)
    28  	operations, err := cstreamParser.Parse()
    29  	if err != nil {
    30  		return buf.String(), err
    31  	}
    32  
    33  	processor := contentstream.NewContentStreamProcessor(*operations)
    34  
    35  	var codemap *cmap.CMap
    36  	inText := false
    37  	xPos, yPos := float64(-1), float64(-1)
    38  
    39  	processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
    40  		func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error {
    41  			operand := op.Operand
    42  			switch operand {
    43  			case "BT":
    44  				inText = true
    45  			case "ET":
    46  				inText = false
    47  			case "Tf":
    48  				if !inText {
    49  					common.Log.Debug("Tf operand outside text")
    50  					return nil
    51  				}
    52  
    53  				if len(op.Params) != 2 {
    54  					common.Log.Debug("Error Tf should only get 2 input params, got %d", len(op.Params))
    55  					return errors.New("Incorrect parameter count")
    56  				}
    57  
    58  				codemap = nil
    59  
    60  				fontName, ok := op.Params[0].(*core.PdfObjectName)
    61  				if !ok {
    62  					common.Log.Debug("Error Tf font input not a name")
    63  					return errors.New("Tf range error")
    64  				}
    65  
    66  				if resources == nil {
    67  					return nil
    68  				}
    69  
    70  				fontObj, found := resources.GetFontByName(*fontName)
    71  				if !found {
    72  					common.Log.Debug("Font not found...")
    73  					return errors.New("Font not in resources")
    74  				}
    75  
    76  				fontObj = core.TraceToDirectObject(fontObj)
    77  				if fontDict, isDict := fontObj.(*core.PdfObjectDictionary); isDict {
    78  					toUnicode := fontDict.Get("ToUnicode")
    79  					if toUnicode != nil {
    80  						toUnicode = core.TraceToDirectObject(toUnicode)
    81  						toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream)
    82  						if !ok {
    83  							return errors.New("Invalid ToUnicode entry - not a stream")
    84  						}
    85  						decoded, err := core.DecodeStream(toUnicodeStream)
    86  						if err != nil {
    87  							return err
    88  						}
    89  
    90  						codemap, err = cmap.LoadCmapFromData(decoded)
    91  						if err != nil {
    92  							return err
    93  						}
    94  					}
    95  				}
    96  			case "T*":
    97  				if !inText {
    98  					common.Log.Debug("T* operand outside text")
    99  					return nil
   100  				}
   101  				buf.WriteString("\n")
   102  			case "Td", "TD":
   103  				if !inText {
   104  					common.Log.Debug("Td/TD operand outside text")
   105  					return nil
   106  				}
   107  
   108  				// Params: [tx ty], corresponeds to Tm=Tlm=[1 0 0;0 1 0;tx ty 1]*Tm
   109  				if len(op.Params) != 2 {
   110  					common.Log.Debug("Td/TD invalid arguments")
   111  					return nil
   112  				}
   113  				tx, err := getNumberAsFloat(op.Params[0])
   114  				if err != nil {
   115  					common.Log.Debug("Td Float parse error")
   116  					return nil
   117  				}
   118  				ty, err := getNumberAsFloat(op.Params[1])
   119  				if err != nil {
   120  					common.Log.Debug("Td Float parse error")
   121  					return nil
   122  				}
   123  
   124  				if tx > 0 {
   125  					buf.WriteString(" ")
   126  				}
   127  				if ty < 0 {
   128  					// TODO: More flexible space characters?
   129  					buf.WriteString("\n")
   130  				}
   131  			case "Tm":
   132  				if !inText {
   133  					common.Log.Debug("Tm operand outside text")
   134  					return nil
   135  				}
   136  
   137  				// Params: a,b,c,d,e,f as in Tm = [a b 0; c d 0; e f 1].
   138  				// The last two (e,f) represent translation.
   139  				if len(op.Params) != 6 {
   140  					return errors.New("Tm: Invalid number of inputs")
   141  				}
   142  				xfloat, ok := op.Params[4].(*core.PdfObjectFloat)
   143  				if !ok {
   144  					xint, ok := op.Params[4].(*core.PdfObjectInteger)
   145  					if !ok {
   146  						return nil
   147  					}
   148  					xfloat = core.MakeFloat(float64(*xint))
   149  				}
   150  				yfloat, ok := op.Params[5].(*core.PdfObjectFloat)
   151  				if !ok {
   152  					yint, ok := op.Params[5].(*core.PdfObjectInteger)
   153  					if !ok {
   154  						return nil
   155  					}
   156  					yfloat = core.MakeFloat(float64(*yint))
   157  				}
   158  				if yPos == -1 {
   159  					yPos = float64(*yfloat)
   160  				} else if yPos > float64(*yfloat) {
   161  					buf.WriteString("\n")
   162  					xPos = float64(*xfloat)
   163  					yPos = float64(*yfloat)
   164  					return nil
   165  				}
   166  				if xPos == -1 {
   167  					xPos = float64(*xfloat)
   168  				} else if xPos < float64(*xfloat) {
   169  					buf.WriteString("\t")
   170  					xPos = float64(*xfloat)
   171  				}
   172  			case "TJ":
   173  				if !inText {
   174  					common.Log.Debug("TJ operand outside text")
   175  					return nil
   176  				}
   177  				if len(op.Params) < 1 {
   178  					return nil
   179  				}
   180  				paramList, ok := op.Params[0].(*core.PdfObjectArray)
   181  				if !ok {
   182  					return fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
   183  				}
   184  				for _, obj := range *paramList {
   185  					switch v := obj.(type) {
   186  					case *core.PdfObjectString:
   187  						if codemap != nil {
   188  							buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*v)))
   189  						} else {
   190  							buf.WriteString(string(*v))
   191  						}
   192  					case *core.PdfObjectFloat:
   193  						if *v < -100 {
   194  							buf.WriteString(" ")
   195  						}
   196  					case *core.PdfObjectInteger:
   197  						if *v < -100 {
   198  							buf.WriteString(" ")
   199  						}
   200  					}
   201  				}
   202  			case "Tj":
   203  				if !inText {
   204  					common.Log.Debug("Tj operand outside text")
   205  					return nil
   206  				}
   207  				if len(op.Params) < 1 {
   208  					return nil
   209  				}
   210  				param, ok := op.Params[0].(*core.PdfObjectString)
   211  				if !ok {
   212  					return fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
   213  				}
   214  				if codemap != nil {
   215  					buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*param)))
   216  				} else {
   217  					buf.WriteString(string(*param))
   218  				}
   219  			}
   220  
   221  			return nil
   222  		})
   223  
   224  	err = processor.Process(e.resources)
   225  	if err != nil {
   226  		common.Log.Error("Error processing: %v", err)
   227  		return buf.String(), err
   228  	}
   229  
   230  	procBuf(&buf)
   231  
   232  	return buf.String(), nil
   233  }