github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/contentstream/parser.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  package contentstream
     7  
     8  import (
     9  	"bufio"
    10  	"bytes"
    11  	"encoding/hex"
    12  	"errors"
    13  	"fmt"
    14  	"io"
    15  	"strconv"
    16  
    17  	"github.com/unidoc/unidoc/common"
    18  	. "github.com/unidoc/unidoc/pdf/core"
    19  )
    20  
    21  // Content stream parser.
    22  type ContentStreamParser struct {
    23  	reader *bufio.Reader
    24  }
    25  
    26  // Create a new instance of the content stream parser from an input content
    27  // stream string.
    28  func NewContentStreamParser(contentStr string) *ContentStreamParser {
    29  	// Each command has parameters and an operand (command).
    30  	parser := ContentStreamParser{}
    31  
    32  	buffer := bytes.NewBufferString(contentStr + "\n") // Add newline at end to get last operand without EOF error.
    33  	parser.reader = bufio.NewReader(buffer)
    34  
    35  	return &parser
    36  }
    37  
    38  // Parses all commands in content stream, returning a list of operation data.
    39  func (this *ContentStreamParser) Parse() (*ContentStreamOperations, error) {
    40  	operations := ContentStreamOperations{}
    41  
    42  	for {
    43  		operation := ContentStreamOperation{}
    44  
    45  		for {
    46  			obj, err, isOperand := this.parseObject()
    47  			if err != nil {
    48  				if err == io.EOF {
    49  					// End of data. Successful exit point.
    50  					return &operations, nil
    51  				}
    52  				return &operations, err
    53  			}
    54  			if isOperand {
    55  				operation.Operand = string(*obj.(*PdfObjectString))
    56  				operations = append(operations, &operation)
    57  				break
    58  			} else {
    59  				operation.Params = append(operation.Params, obj)
    60  			}
    61  		}
    62  
    63  		if operation.Operand == "BI" {
    64  			// Parse an inline image, reads everything between the "BI" and "EI".
    65  			// The image is stored as the parameter.
    66  			im, err := this.ParseInlineImage()
    67  			if err != nil {
    68  				return &operations, err
    69  			}
    70  			operation.Params = append(operation.Params, im)
    71  		}
    72  	}
    73  }
    74  
    75  // Skip over any spaces.  Returns the number of spaces skipped and
    76  // an error if any.
    77  func (this *ContentStreamParser) skipSpaces() (int, error) {
    78  	cnt := 0
    79  	for {
    80  		bb, err := this.reader.Peek(1)
    81  		if err != nil {
    82  			return 0, err
    83  		}
    84  		if IsWhiteSpace(bb[0]) {
    85  			this.reader.ReadByte()
    86  			cnt++
    87  		} else {
    88  			break
    89  		}
    90  	}
    91  
    92  	return cnt, nil
    93  }
    94  
    95  // Skip over comments and spaces. Can handle multi-line comments.
    96  func (this *ContentStreamParser) skipComments() error {
    97  	if _, err := this.skipSpaces(); err != nil {
    98  		return err
    99  	}
   100  
   101  	isFirst := true
   102  	for {
   103  		bb, err := this.reader.Peek(1)
   104  		if err != nil {
   105  			common.Log.Debug("Error %s", err.Error())
   106  			return err
   107  		}
   108  		if isFirst && bb[0] != '%' {
   109  			// Not a comment clearly.
   110  			return nil
   111  		} else {
   112  			isFirst = false
   113  		}
   114  		if (bb[0] != '\r') && (bb[0] != '\n') {
   115  			this.reader.ReadByte()
   116  		} else {
   117  			break
   118  		}
   119  	}
   120  
   121  	// Call recursively to handle multiline comments.
   122  	return this.skipComments()
   123  }
   124  
   125  // Parse a name starting with '/'.
   126  func (this *ContentStreamParser) parseName() (PdfObjectName, error) {
   127  	name := ""
   128  	nameStarted := false
   129  	for {
   130  		bb, err := this.reader.Peek(1)
   131  		if err == io.EOF {
   132  			break // Can happen when loading from object stream.
   133  		}
   134  		if err != nil {
   135  			return PdfObjectName(name), err
   136  		}
   137  
   138  		if !nameStarted {
   139  			// Should always start with '/', otherwise not valid.
   140  			if bb[0] == '/' {
   141  				nameStarted = true
   142  				this.reader.ReadByte()
   143  			} else {
   144  				common.Log.Error("Name starting with %s (% x)", bb, bb)
   145  				return PdfObjectName(name), fmt.Errorf("Invalid name: (%c)", bb[0])
   146  			}
   147  		} else {
   148  			if IsWhiteSpace(bb[0]) {
   149  				break
   150  			} else if (bb[0] == '/') || (bb[0] == '[') || (bb[0] == '(') || (bb[0] == ']') || (bb[0] == '<') || (bb[0] == '>') {
   151  				break // Looks like start of next statement.
   152  			} else if bb[0] == '#' {
   153  				hexcode, err := this.reader.Peek(3)
   154  				if err != nil {
   155  					return PdfObjectName(name), err
   156  				}
   157  				this.reader.Discard(3)
   158  
   159  				code, err := hex.DecodeString(string(hexcode[1:3]))
   160  				if err != nil {
   161  					return PdfObjectName(name), err
   162  				}
   163  				name += string(code)
   164  			} else {
   165  				b, _ := this.reader.ReadByte()
   166  				name += string(b)
   167  			}
   168  		}
   169  	}
   170  	return PdfObjectName(name), nil
   171  }
   172  
   173  // Numeric objects.
   174  // Section 7.3.3.
   175  // Integer or Float.
   176  //
   177  // An integer shall be written as one or more decimal digits optionally
   178  // preceded by a sign. The value shall be interpreted as a signed
   179  // decimal integer and shall be converted to an integer object.
   180  //
   181  // A real value shall be written as one or more decimal digits with an
   182  // optional sign and a leading, trailing, or embedded PERIOD (2Eh)
   183  // (decimal point). The value shall be interpreted as a real number
   184  // and shall be converted to a real object.
   185  //
   186  // Regarding exponential numbers: 7.3.3 Numeric Objects:
   187  // A conforming writer shall not use the PostScript syntax for numbers
   188  // with non-decimal radices (such as 16#FFFE) or in exponential format
   189  // (such as 6.02E23).
   190  // Nonetheless, we sometimes get numbers with exponential format, so
   191  // we will support it in the reader (no confusion with other types, so
   192  // no compromise).
   193  func (this *ContentStreamParser) parseNumber() (PdfObject, error) {
   194  	isFloat := false
   195  	allowSigns := true
   196  	numStr := ""
   197  	for {
   198  		common.Log.Trace("Parsing number \"%s\"", numStr)
   199  		bb, err := this.reader.Peek(1)
   200  		if err == io.EOF {
   201  			// GH: EOF handling.  Handle EOF like end of line.  Can happen with
   202  			// encoded object streams that the object is at the end.
   203  			// In other cases, we will get the EOF error elsewhere at any rate.
   204  			break // Handle like EOF
   205  		}
   206  		if err != nil {
   207  			common.Log.Error("ERROR %s", err)
   208  			return nil, err
   209  		}
   210  		if allowSigns && (bb[0] == '-' || bb[0] == '+') {
   211  			// Only appear in the beginning, otherwise serves as a delimiter.
   212  			b, _ := this.reader.ReadByte()
   213  			numStr += string(b)
   214  			allowSigns = false // Only allowed in beginning, and after e (exponential).
   215  		} else if IsDecimalDigit(bb[0]) {
   216  			b, _ := this.reader.ReadByte()
   217  			numStr += string(b)
   218  		} else if bb[0] == '.' {
   219  			b, _ := this.reader.ReadByte()
   220  			numStr += string(b)
   221  			isFloat = true
   222  		} else if bb[0] == 'e' {
   223  			// Exponential number format.
   224  			b, _ := this.reader.ReadByte()
   225  			numStr += string(b)
   226  			isFloat = true
   227  			allowSigns = true
   228  		} else {
   229  			break
   230  		}
   231  	}
   232  
   233  	if isFloat {
   234  		fVal, err := strconv.ParseFloat(numStr, 64)
   235  		if err != nil {
   236  			common.Log.Debug("Error parsing number %q err=%v. Using 0.0. Output may be incorrect", numStr, err)
   237  			fVal = 0.0
   238  			err = nil
   239  		}
   240  		o := PdfObjectFloat(fVal)
   241  		return &o, err
   242  	} else {
   243  		intVal, err := strconv.ParseInt(numStr, 10, 64)
   244  		if err != nil {
   245  			common.Log.Debug("Error parsing integer %q err=%v. Using 0. Output may be incorrect", numStr, err)
   246  			intVal = 0
   247  			err = nil
   248  		}
   249  		o := PdfObjectInteger(intVal)
   250  		return &o, err
   251  	}
   252  }
   253  
   254  // A string starts with '(' and ends with ')'.
   255  func (this *ContentStreamParser) parseString() (PdfObjectString, error) {
   256  	this.reader.ReadByte()
   257  
   258  	bytes := []byte{}
   259  	count := 1
   260  	for {
   261  		bb, err := this.reader.Peek(1)
   262  		if err != nil {
   263  			return PdfObjectString(bytes), err
   264  		}
   265  
   266  		if bb[0] == '\\' { // Escape sequence.
   267  			this.reader.ReadByte() // Skip the escape \ byte.
   268  			b, err := this.reader.ReadByte()
   269  			if err != nil {
   270  				return PdfObjectString(bytes), err
   271  			}
   272  
   273  			// Octal '\ddd' number (base 8).
   274  			if IsOctalDigit(b) {
   275  				bb, err := this.reader.Peek(2)
   276  				if err != nil {
   277  					return PdfObjectString(bytes), err
   278  				}
   279  
   280  				numeric := []byte{}
   281  				numeric = append(numeric, b)
   282  				for _, val := range bb {
   283  					if IsOctalDigit(val) {
   284  						numeric = append(numeric, val)
   285  					} else {
   286  						break
   287  					}
   288  				}
   289  				this.reader.Discard(len(numeric) - 1)
   290  
   291  				common.Log.Trace("Numeric string \"%s\"", numeric)
   292  				code, err := strconv.ParseUint(string(numeric), 8, 32)
   293  				if err != nil {
   294  					return PdfObjectString(bytes), err
   295  				}
   296  				bytes = append(bytes, byte(code))
   297  				continue
   298  			}
   299  
   300  			switch b {
   301  			case 'n':
   302  				bytes = append(bytes, '\n')
   303  			case 'r':
   304  				bytes = append(bytes, '\r')
   305  			case 't':
   306  				bytes = append(bytes, '\t')
   307  			case 'b':
   308  				bytes = append(bytes, '\b')
   309  			case 'f':
   310  				bytes = append(bytes, '\f')
   311  			case '(':
   312  				bytes = append(bytes, '(')
   313  			case ')':
   314  				bytes = append(bytes, ')')
   315  			case '\\':
   316  				bytes = append(bytes, '\\')
   317  			}
   318  
   319  			continue
   320  		} else if bb[0] == '(' {
   321  			count++
   322  		} else if bb[0] == ')' {
   323  			count--
   324  			if count == 0 {
   325  				this.reader.ReadByte()
   326  				break
   327  			}
   328  		}
   329  
   330  		b, _ := this.reader.ReadByte()
   331  		bytes = append(bytes, b)
   332  	}
   333  
   334  	return PdfObjectString(bytes), nil
   335  }
   336  
   337  // Starts with '<' ends with '>'.
   338  func (this *ContentStreamParser) parseHexString() (PdfObjectString, error) {
   339  	this.reader.ReadByte()
   340  
   341  	hextable := []byte("0123456789abcdefABCDEF")
   342  
   343  	tmp := []byte{}
   344  	for {
   345  		this.skipSpaces()
   346  
   347  		bb, err := this.reader.Peek(1)
   348  		if err != nil {
   349  			return PdfObjectString(""), err
   350  		}
   351  
   352  		if bb[0] == '>' {
   353  			this.reader.ReadByte()
   354  			break
   355  		}
   356  
   357  		b, _ := this.reader.ReadByte()
   358  		if bytes.IndexByte(hextable, b) >= 0 {
   359  			tmp = append(tmp, b)
   360  		}
   361  	}
   362  
   363  	if len(tmp)%2 == 1 {
   364  		tmp = append(tmp, '0')
   365  	}
   366  
   367  	buf, _ := hex.DecodeString(string(tmp))
   368  	return PdfObjectString(buf), nil
   369  }
   370  
   371  // Starts with '[' ends with ']'.  Can contain any kinds of direct objects.
   372  func (this *ContentStreamParser) parseArray() (PdfObjectArray, error) {
   373  	arr := make(PdfObjectArray, 0)
   374  
   375  	this.reader.ReadByte()
   376  
   377  	for {
   378  		this.skipSpaces()
   379  
   380  		bb, err := this.reader.Peek(1)
   381  		if err != nil {
   382  			return arr, err
   383  		}
   384  
   385  		if bb[0] == ']' {
   386  			this.reader.ReadByte()
   387  			break
   388  		}
   389  
   390  		obj, err, _ := this.parseObject()
   391  		if err != nil {
   392  			return arr, err
   393  		}
   394  		arr = append(arr, obj)
   395  	}
   396  
   397  	return arr, nil
   398  }
   399  
   400  // Parse bool object.
   401  func (this *ContentStreamParser) parseBool() (PdfObjectBool, error) {
   402  	bb, err := this.reader.Peek(4)
   403  	if err != nil {
   404  		return PdfObjectBool(false), err
   405  	}
   406  	if (len(bb) >= 4) && (string(bb[:4]) == "true") {
   407  		this.reader.Discard(4)
   408  		return PdfObjectBool(true), nil
   409  	}
   410  
   411  	bb, err = this.reader.Peek(5)
   412  	if err != nil {
   413  		return PdfObjectBool(false), err
   414  	}
   415  	if (len(bb) >= 5) && (string(bb[:5]) == "false") {
   416  		this.reader.Discard(5)
   417  		return PdfObjectBool(false), nil
   418  	}
   419  
   420  	return PdfObjectBool(false), errors.New("Unexpected boolean string")
   421  }
   422  
   423  // Parse null object.
   424  func (this *ContentStreamParser) parseNull() (PdfObjectNull, error) {
   425  	_, err := this.reader.Discard(4)
   426  	return PdfObjectNull{}, err
   427  }
   428  
   429  func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) {
   430  	common.Log.Trace("Reading content stream dict!")
   431  
   432  	dict := MakeDict()
   433  
   434  	// Pass the '<<'
   435  	c, _ := this.reader.ReadByte()
   436  	if c != '<' {
   437  		return nil, errors.New("Invalid dict")
   438  	}
   439  	c, _ = this.reader.ReadByte()
   440  	if c != '<' {
   441  		return nil, errors.New("Invalid dict")
   442  	}
   443  
   444  	for {
   445  		this.skipSpaces()
   446  
   447  		bb, err := this.reader.Peek(2)
   448  		if err != nil {
   449  			return nil, err
   450  		}
   451  
   452  		common.Log.Trace("Dict peek: %s (% x)!", string(bb), string(bb))
   453  		if (bb[0] == '>') && (bb[1] == '>') {
   454  			common.Log.Trace("EOF dictionary")
   455  			this.reader.ReadByte()
   456  			this.reader.ReadByte()
   457  			break
   458  		}
   459  		common.Log.Trace("Parse the name!")
   460  
   461  		keyName, err := this.parseName()
   462  		common.Log.Trace("Key: %s", keyName)
   463  		if err != nil {
   464  			common.Log.Debug("ERROR Returning name err %s", err)
   465  			return nil, err
   466  		}
   467  
   468  		if len(keyName) > 4 && keyName[len(keyName)-4:] == "null" {
   469  			// Some writers have a bug where the null is appended without
   470  			// space.  For example "\Boundsnull"
   471  			newKey := keyName[0 : len(keyName)-4]
   472  			common.Log.Trace("Taking care of null bug (%s)", keyName)
   473  			common.Log.Trace("New key \"%s\" = null", newKey)
   474  			this.skipSpaces()
   475  			bb, _ := this.reader.Peek(1)
   476  			if bb[0] == '/' {
   477  				dict.Set(newKey, MakeNull())
   478  				continue
   479  			}
   480  		}
   481  
   482  		this.skipSpaces()
   483  
   484  		val, err, _ := this.parseObject()
   485  		if err != nil {
   486  			return nil, err
   487  		}
   488  		dict.Set(keyName, val)
   489  
   490  		common.Log.Trace("dict[%s] = %s", keyName, val.String())
   491  	}
   492  
   493  	return dict, nil
   494  }
   495  
   496  // An operand is a text command represented by a word.
   497  func (this *ContentStreamParser) parseOperand() (PdfObjectString, error) {
   498  	bytes := []byte{}
   499  	for {
   500  		bb, err := this.reader.Peek(1)
   501  		if err != nil {
   502  			return PdfObjectString(bytes), err
   503  		}
   504  		if IsDelimiter(bb[0]) {
   505  			break
   506  		}
   507  		if IsWhiteSpace(bb[0]) {
   508  			break
   509  		}
   510  
   511  		b, _ := this.reader.ReadByte()
   512  		bytes = append(bytes, b)
   513  	}
   514  
   515  	return PdfObjectString(bytes), nil
   516  }
   517  
   518  // Parse a generic object.  Returns the object, an error code, and a bool
   519  // value indicating whether the object is an operand.  An operand
   520  // is contained in a pdf string object.
   521  func (this *ContentStreamParser) parseObject() (PdfObject, error, bool) {
   522  	// Determine the kind of object.
   523  	// parse it!
   524  	// make a list of operands, then once operand arrives put into a package.
   525  
   526  	this.skipSpaces()
   527  	for {
   528  		bb, err := this.reader.Peek(2)
   529  		if err != nil {
   530  			return nil, err, false
   531  		}
   532  
   533  		common.Log.Trace("Peek string: %s", string(bb))
   534  		// Determine type.
   535  		if bb[0] == '%' {
   536  			this.skipComments()
   537  			continue
   538  		} else if bb[0] == '/' {
   539  			name, err := this.parseName()
   540  			common.Log.Trace("->Name: '%s'", name)
   541  			return &name, err, false
   542  		} else if bb[0] == '(' {
   543  			common.Log.Trace("->String!")
   544  			str, err := this.parseString()
   545  			common.Log.Trace("(%s)\n", str.String())
   546  			return &str, err, false
   547  		} else if bb[0] == '<' && bb[1] != '<' {
   548  			common.Log.Trace("->Hex String!")
   549  			str, err := this.parseHexString()
   550  			return &str, err, false
   551  		} else if bb[0] == '[' {
   552  			common.Log.Trace("->Array!")
   553  			arr, err := this.parseArray()
   554  			return &arr, err, false
   555  		} else if IsFloatDigit(bb[0]) || (bb[0] == '-' && IsFloatDigit(bb[1])) {
   556  			common.Log.Trace("->Number!")
   557  			number, err := this.parseNumber()
   558  			return number, err, false
   559  		} else if bb[0] == '<' && bb[1] == '<' {
   560  			dict, err := this.parseDict()
   561  			return dict, err, false
   562  		} else {
   563  			// Otherwise, can be: keyword such as "null", "false", "true" or an operand...
   564  			common.Log.Trace("->Operand or bool?")
   565  			// Let's peek farther to find out.
   566  			bb, _ = this.reader.Peek(5)
   567  			peekStr := string(bb)
   568  			common.Log.Trace("cont Peek str: %s", peekStr)
   569  
   570  			if (len(peekStr) > 3) && (peekStr[:4] == "null") {
   571  				null, err := this.parseNull()
   572  				return &null, err, false
   573  			} else if (len(peekStr) > 4) && (peekStr[:5] == "false") {
   574  				b, err := this.parseBool()
   575  				return &b, err, false
   576  			} else if (len(peekStr) > 3) && (peekStr[:4] == "true") {
   577  				b, err := this.parseBool()
   578  				return &b, err, false
   579  			}
   580  
   581  			operand, err := this.parseOperand()
   582  			if err != nil {
   583  				return &operand, err, false
   584  			}
   585  			if len(operand.String()) < 1 {
   586  				return &operand, ErrInvalidOperand, false
   587  			}
   588  			return &operand, nil, true
   589  		}
   590  	}
   591  }