github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/contentstream/contentstream.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  package contentstream
     7  
     8  import (
     9  	"bytes"
    10  	"fmt"
    11  
    12  	. "github.com/unidoc/unidoc/pdf/core"
    13  )
    14  
// ContentStreamOperation is a single content stream instruction: a list of
// parameter objects together with the keyword that consumes them.
type ContentStreamOperation struct {
	// Params holds the objects that are serialized before the keyword.
	Params  []PdfObject
	// Operand is the keyword itself, e.g. "q", "Q", "BT", "Tj" or "BI"
	// (what this field stores is the operator name, despite the field's name).
	Operand string
}
    19  
// ContentStreamOperations is an ordered sequence of content stream operations.
// Entries may be nil; Bytes skips such entries when serializing.
type ContentStreamOperations []*ContentStreamOperation
    21  
    22  // Check if the content stream operations are fully wrapped (within q ... Q)
    23  func (this *ContentStreamOperations) isWrapped() bool {
    24  	if len(*this) < 2 {
    25  		return false
    26  	}
    27  
    28  	depth := 0
    29  	for _, op := range *this {
    30  		if op.Operand == "q" {
    31  			depth++
    32  		} else if op.Operand == "Q" {
    33  			depth--
    34  		} else {
    35  			if depth < 1 {
    36  				return false
    37  			}
    38  		}
    39  	}
    40  
    41  	// Should end at depth == 0
    42  	return depth == 0
    43  }
    44  
    45  // Wrap entire contents within q ... Q.  If unbalanced, then adds extra Qs at the end.
    46  // Only does if needed. Ensures that when adding new content, one start with all states
    47  // in the default condition.
    48  func (this *ContentStreamOperations) WrapIfNeeded() *ContentStreamOperations {
    49  	if len(*this) == 0 {
    50  		// No need to wrap if empty.
    51  		return this
    52  	}
    53  	if this.isWrapped() {
    54  		return this
    55  	}
    56  
    57  	*this = append([]*ContentStreamOperation{{Operand: "q"}}, *this...)
    58  
    59  	depth := 0
    60  	for _, op := range *this {
    61  		if op.Operand == "q" {
    62  			depth++
    63  		} else if op.Operand == "Q" {
    64  			depth--
    65  		}
    66  	}
    67  
    68  	for depth > 0 {
    69  		*this = append(*this, &ContentStreamOperation{Operand: "Q"})
    70  		depth--
    71  	}
    72  
    73  	return this
    74  }
    75  
    76  // Convert a set of content stream operations to a content stream byte presentation, i.e. the kind that can be
    77  // stored as a PDF stream or string format.
    78  func (this *ContentStreamOperations) Bytes() []byte {
    79  	var buf bytes.Buffer
    80  
    81  	for _, op := range *this {
    82  		if op == nil {
    83  			continue
    84  		}
    85  
    86  		if op.Operand == "BI" {
    87  			// Inline image requires special handling.
    88  			buf.WriteString(op.Operand + "\n")
    89  			buf.WriteString(op.Params[0].DefaultWriteString())
    90  
    91  		} else {
    92  			// Default handler.
    93  			for _, param := range op.Params {
    94  				buf.WriteString(param.DefaultWriteString())
    95  				buf.WriteString(" ")
    96  
    97  			}
    98  
    99  			buf.WriteString(op.Operand + "\n")
   100  		}
   101  	}
   102  
   103  	return buf.Bytes()
   104  }
   105  
   106  // ExtractText parses and extracts all text data in content streams and returns as a string.
   107  // Does not take into account Encoding table, the output is simply the character codes.
   108  //
   109  // Deprecated: More advanced text extraction is offered in package extractor with character encoding support.
   110  func (this *ContentStreamParser) ExtractText() (string, error) {
   111  	operations, err := this.Parse()
   112  	if err != nil {
   113  		return "", err
   114  	}
   115  	inText := false
   116  	xPos, yPos := float64(-1), float64(-1)
   117  	txt := ""
   118  	for _, op := range *operations {
   119  		if op.Operand == "BT" {
   120  			inText = true
   121  		} else if op.Operand == "ET" {
   122  			inText = false
   123  		}
   124  		if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" {
   125  			// Move to next line...
   126  			txt += "\n"
   127  		}
   128  		if op.Operand == "Tm" {
   129  			if len(op.Params) != 6 {
   130  				continue
   131  			}
   132  			xfloat, ok := op.Params[4].(*PdfObjectFloat)
   133  			if !ok {
   134  				xint, ok := op.Params[4].(*PdfObjectInteger)
   135  				if !ok {
   136  					continue
   137  				}
   138  				xfloat = MakeFloat(float64(*xint))
   139  			}
   140  			yfloat, ok := op.Params[5].(*PdfObjectFloat)
   141  			if !ok {
   142  				yint, ok := op.Params[5].(*PdfObjectInteger)
   143  				if !ok {
   144  					continue
   145  				}
   146  				yfloat = MakeFloat(float64(*yint))
   147  			}
   148  			if yPos == -1 {
   149  				yPos = float64(*yfloat)
   150  			} else if yPos > float64(*yfloat) {
   151  				txt += "\n"
   152  				xPos = float64(*xfloat)
   153  				yPos = float64(*yfloat)
   154  				continue
   155  			}
   156  			if xPos == -1 {
   157  				xPos = float64(*xfloat)
   158  			} else if xPos < float64(*xfloat) {
   159  				txt += "\t"
   160  				xPos = float64(*xfloat)
   161  			}
   162  		}
   163  		if inText && op.Operand == "TJ" {
   164  			if len(op.Params) < 1 {
   165  				continue
   166  			}
   167  			paramList, ok := op.Params[0].(*PdfObjectArray)
   168  			if !ok {
   169  				return "", fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
   170  			}
   171  			for _, obj := range *paramList {
   172  				switch v := obj.(type) {
   173  				case *PdfObjectString:
   174  					txt += string(*v)
   175  				case *PdfObjectFloat:
   176  					if *v < -100 {
   177  						txt += " "
   178  					}
   179  				case *PdfObjectInteger:
   180  					if *v < -100 {
   181  						txt += " "
   182  					}
   183  				}
   184  			}
   185  		} else if inText && op.Operand == "Tj" {
   186  			if len(op.Params) < 1 {
   187  				continue
   188  			}
   189  			param, ok := op.Params[0].(*PdfObjectString)
   190  			if !ok {
   191  				return "", fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
   192  			}
   193  			txt += string(*param)
   194  		}
   195  	}
   196  
   197  	return txt, nil
   198  }