github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/internal/cmap/parser.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  package cmap
     7  
     8  import (
     9  	"bufio"
    10  	"bytes"
    11  	"errors"
    12  	"fmt"
    13  	"io"
    14  	"strconv"
    15  
    16  	"encoding/hex"
    17  
    18  	"github.com/unidoc/unidoc/common"
    19  	"github.com/unidoc/unidoc/pdf/core"
    20  )
    21  
    22  // cMapParser parses CMap character to unicode mapping files.
    23  type cMapParser struct {
    24  	reader *bufio.Reader
    25  }
    26  
    27  // cMapParser creates a new instance of the PDF CMap parser from input data.
    28  func newCMapParser(content []byte) *cMapParser {
    29  	parser := cMapParser{}
    30  
    31  	buffer := bytes.NewBuffer(content)
    32  	parser.reader = bufio.NewReader(buffer)
    33  
    34  	return &parser
    35  }
    36  
    37  // Detect the signature at the current file position and parse
    38  // the corresponding object.
    39  func (p *cMapParser) parseObject() (cmapObject, error) {
    40  	p.skipSpaces()
    41  	for {
    42  		bb, err := p.reader.Peek(2)
    43  		if err != nil {
    44  			return nil, err
    45  		}
    46  
    47  		if bb[0] == '%' {
    48  			p.parseComment()
    49  			p.skipSpaces()
    50  			continue
    51  		} else if bb[0] == '/' {
    52  			name, err := p.parseName()
    53  			return name, err
    54  		} else if bb[0] == '(' {
    55  			str, err := p.parseString()
    56  			return str, err
    57  		} else if bb[0] == '[' {
    58  			arr, err := p.parseArray()
    59  			return arr, err
    60  		} else if (bb[0] == '<') && (bb[1] == '<') {
    61  			dict, err := p.parseDict()
    62  			return dict, err
    63  		} else if bb[0] == '<' {
    64  			shex, err := p.parseHexString()
    65  			return shex, err
    66  		} else if core.IsDecimalDigit(bb[0]) || (bb[0] == '-' && core.IsDecimalDigit(bb[1])) {
    67  			number, err := p.parseNumber()
    68  			if err != nil {
    69  				return nil, err
    70  			}
    71  			return number, nil
    72  		} else {
    73  			// Operand?
    74  			operand, err := p.parseOperand()
    75  			if err != nil {
    76  				return nil, err
    77  			}
    78  
    79  			return operand, nil
    80  		}
    81  	}
    82  }
    83  
    84  // Skip over any spaces.  Returns the number of spaces skipped and
    85  // an error if any.
    86  func (p *cMapParser) skipSpaces() (int, error) {
    87  	cnt := 0
    88  	for {
    89  		bb, err := p.reader.Peek(1)
    90  		if err != nil {
    91  			return 0, err
    92  		}
    93  		if core.IsWhiteSpace(bb[0]) {
    94  			p.reader.ReadByte()
    95  			cnt++
    96  		} else {
    97  			break
    98  		}
    99  	}
   100  
   101  	return cnt, nil
   102  }
   103  
   104  // parseComment reads a comment line starting with '%'.
   105  func (p *cMapParser) parseComment() (string, error) {
   106  	var r bytes.Buffer
   107  
   108  	_, err := p.skipSpaces()
   109  	if err != nil {
   110  		return r.String(), err
   111  	}
   112  
   113  	isFirst := true
   114  	for {
   115  		bb, err := p.reader.Peek(1)
   116  		if err != nil {
   117  			common.Log.Debug("Error %s", err.Error())
   118  			return r.String(), err
   119  		}
   120  		if isFirst && bb[0] != '%' {
   121  			return r.String(), errors.New("Comment should start with %")
   122  		}
   123  		isFirst = false
   124  		if (bb[0] != '\r') && (bb[0] != '\n') {
   125  			b, _ := p.reader.ReadByte()
   126  			r.WriteByte(b)
   127  		} else {
   128  			break
   129  		}
   130  	}
   131  	return r.String(), nil
   132  }
   133  
   134  // Parse a name starting with '/'.
   135  func (p *cMapParser) parseName() (cmapName, error) {
   136  	name := ""
   137  	nameStarted := false
   138  	for {
   139  		bb, err := p.reader.Peek(1)
   140  		if err == io.EOF {
   141  			break // Can happen when loading from object stream.
   142  		}
   143  		if err != nil {
   144  			return cmapName{name}, err
   145  		}
   146  
   147  		if !nameStarted {
   148  			// Should always start with '/', otherwise not valid.
   149  			if bb[0] == '/' {
   150  				nameStarted = true
   151  				p.reader.ReadByte()
   152  			} else {
   153  				common.Log.Debug("ERROR Name starting with %s (% x)", bb, bb)
   154  				return cmapName{name}, fmt.Errorf("Invalid name: (%c)", bb[0])
   155  			}
   156  		} else {
   157  			if core.IsWhiteSpace(bb[0]) {
   158  				break
   159  			} else if (bb[0] == '/') || (bb[0] == '[') || (bb[0] == '(') || (bb[0] == ']') || (bb[0] == '<') || (bb[0] == '>') {
   160  				break // Looks like start of next statement.
   161  			} else if bb[0] == '#' {
   162  				hexcode, err := p.reader.Peek(3)
   163  				if err != nil {
   164  					return cmapName{name}, err
   165  				}
   166  				p.reader.Discard(3)
   167  
   168  				code, err := hex.DecodeString(string(hexcode[1:3]))
   169  				if err != nil {
   170  					return cmapName{name}, err
   171  				}
   172  				name += string(code)
   173  			} else {
   174  				b, _ := p.reader.ReadByte()
   175  				name += string(b)
   176  			}
   177  		}
   178  	}
   179  
   180  	return cmapName{name}, nil
   181  }
   182  
   183  // A string starts with '(' and ends with ')'.
   184  func (p *cMapParser) parseString() (cmapString, error) {
   185  	p.reader.ReadByte()
   186  
   187  	buf := bytes.Buffer{}
   188  
   189  	count := 1
   190  	for {
   191  		bb, err := p.reader.Peek(1)
   192  		if err != nil {
   193  			return cmapString{buf.String()}, err
   194  		}
   195  
   196  		if bb[0] == '\\' { // Escape sequence.
   197  			p.reader.ReadByte() // Skip the escape \ byte.
   198  			b, err := p.reader.ReadByte()
   199  			if err != nil {
   200  				return cmapString{buf.String()}, err
   201  			}
   202  
   203  			// Octal '\ddd' number (base 8).
   204  			if core.IsOctalDigit(b) {
   205  				bb, err := p.reader.Peek(2)
   206  				if err != nil {
   207  					return cmapString{buf.String()}, err
   208  				}
   209  
   210  				numeric := []byte{}
   211  				numeric = append(numeric, b)
   212  				for _, val := range bb {
   213  					if core.IsOctalDigit(val) {
   214  						numeric = append(numeric, val)
   215  					} else {
   216  						break
   217  					}
   218  				}
   219  				p.reader.Discard(len(numeric) - 1)
   220  
   221  				common.Log.Trace("Numeric string \"%s\"", numeric)
   222  				code, err := strconv.ParseUint(string(numeric), 8, 32)
   223  				if err != nil {
   224  					return cmapString{buf.String()}, err
   225  				}
   226  				buf.WriteByte(byte(code))
   227  				continue
   228  			}
   229  
   230  			switch b {
   231  			case 'n':
   232  				buf.WriteByte('\n')
   233  			case 'r':
   234  				buf.WriteByte('\r')
   235  			case 't':
   236  				buf.WriteByte('\t')
   237  			case 'b':
   238  				buf.WriteByte('\b')
   239  			case 'f':
   240  				buf.WriteByte('\f')
   241  			case '(':
   242  				buf.WriteByte('(')
   243  			case ')':
   244  				buf.WriteByte(')')
   245  			case '\\':
   246  				buf.WriteByte('\\')
   247  			}
   248  
   249  			continue
   250  		} else if bb[0] == '(' {
   251  			count++
   252  		} else if bb[0] == ')' {
   253  			count--
   254  			if count == 0 {
   255  				p.reader.ReadByte()
   256  				break
   257  			}
   258  		}
   259  
   260  		b, _ := p.reader.ReadByte()
   261  		buf.WriteByte(b)
   262  	}
   263  
   264  	return cmapString{buf.String()}, nil
   265  }
   266  
   267  // Starts with '<' ends with '>'.
   268  // Currently not converting the hex codes to characters.
   269  func (p *cMapParser) parseHexString() (cmapHexString, error) {
   270  	p.reader.ReadByte()
   271  
   272  	hextable := []byte("0123456789abcdefABCDEF")
   273  
   274  	buf := bytes.Buffer{}
   275  
   276  	//tmp := []byte{}
   277  	for {
   278  		p.skipSpaces()
   279  
   280  		bb, err := p.reader.Peek(1)
   281  		if err != nil {
   282  			return cmapHexString{numBytes: 0, b: []byte("")}, err
   283  		}
   284  
   285  		if bb[0] == '>' {
   286  			p.reader.ReadByte()
   287  			break
   288  		}
   289  
   290  		b, _ := p.reader.ReadByte()
   291  		if bytes.IndexByte(hextable, b) >= 0 {
   292  			buf.WriteByte(b)
   293  		}
   294  	}
   295  
   296  	if buf.Len()%2 == 1 {
   297  		buf.WriteByte('0')
   298  	}
   299  	numBytes := buf.Len() / 2
   300  
   301  	hexb, _ := hex.DecodeString(buf.String())
   302  	return cmapHexString{numBytes: numBytes, b: hexb}, nil
   303  }
   304  
   305  // Starts with '[' ends with ']'.  Can contain any kinds of direct objects.
   306  func (p *cMapParser) parseArray() (cmapArray, error) {
   307  	arr := cmapArray{}
   308  	arr.Array = []cmapObject{}
   309  
   310  	p.reader.ReadByte()
   311  
   312  	for {
   313  		p.skipSpaces()
   314  
   315  		bb, err := p.reader.Peek(1)
   316  		if err != nil {
   317  			return arr, err
   318  		}
   319  
   320  		if bb[0] == ']' {
   321  			p.reader.ReadByte()
   322  			break
   323  		}
   324  
   325  		obj, err := p.parseObject()
   326  		if err != nil {
   327  			return arr, err
   328  		}
   329  		arr.Array = append(arr.Array, obj)
   330  	}
   331  
   332  	return arr, nil
   333  }
   334  
   335  // Reads and parses a PDF dictionary object enclosed with '<<' and '>>'
   336  func (p *cMapParser) parseDict() (cmapDict, error) {
   337  	common.Log.Trace("Reading PDF Dict!")
   338  
   339  	dict := makeDict()
   340  
   341  	// Pass the '<<'
   342  	c, _ := p.reader.ReadByte()
   343  	if c != '<' {
   344  		return dict, errors.New("Invalid dict")
   345  	}
   346  	c, _ = p.reader.ReadByte()
   347  	if c != '<' {
   348  		return dict, errors.New("Invalid dict")
   349  	}
   350  
   351  	for {
   352  		p.skipSpaces()
   353  
   354  		bb, err := p.reader.Peek(2)
   355  		if err != nil {
   356  			return dict, err
   357  		}
   358  
   359  		if (bb[0] == '>') && (bb[1] == '>') {
   360  			p.reader.ReadByte()
   361  			p.reader.ReadByte()
   362  			break
   363  		}
   364  
   365  		key, err := p.parseName()
   366  		common.Log.Trace("Key: %s", key.Name)
   367  		if err != nil {
   368  			common.Log.Debug("ERROR Returning name err %s", err)
   369  			return dict, err
   370  		}
   371  
   372  		p.skipSpaces()
   373  
   374  		val, err := p.parseObject()
   375  		if err != nil {
   376  			return dict, err
   377  		}
   378  		dict.Dict[key.Name] = val
   379  
   380  		// Skip "def" which optionally follows key value dict definitions in CMaps.
   381  		p.skipSpaces()
   382  		bb, err = p.reader.Peek(3)
   383  		if err != nil {
   384  			return dict, err
   385  		}
   386  		if string(bb) == "def" {
   387  			p.reader.Discard(3)
   388  		}
   389  
   390  	}
   391  
   392  	return dict, nil
   393  }
   394  
   395  func (p *cMapParser) parseNumber() (cmapObject, error) {
   396  	isFloat := false
   397  	allowSigns := true
   398  
   399  	numStr := bytes.Buffer{}
   400  	for {
   401  		bb, err := p.reader.Peek(1)
   402  		if err == io.EOF {
   403  			break
   404  		}
   405  		if err != nil {
   406  			return nil, err
   407  		}
   408  		if allowSigns && (bb[0] == '-' || bb[0] == '+') {
   409  			// Only appear in the beginning, otherwise serves as a delimiter.
   410  			b, _ := p.reader.ReadByte()
   411  			numStr.WriteByte(b)
   412  			allowSigns = false // Only allowed in beginning, and after e (exponential).
   413  		} else if core.IsDecimalDigit(bb[0]) {
   414  			b, _ := p.reader.ReadByte()
   415  			numStr.WriteByte(b)
   416  		} else if bb[0] == '.' {
   417  			b, _ := p.reader.ReadByte()
   418  			numStr.WriteByte(b)
   419  			isFloat = true
   420  		} else if bb[0] == 'e' {
   421  			// Exponential number format.
   422  			b, _ := p.reader.ReadByte()
   423  			numStr.WriteByte(b)
   424  			isFloat = true
   425  			allowSigns = true
   426  		} else {
   427  			break
   428  		}
   429  	}
   430  
   431  	if isFloat {
   432  		fVal, err := strconv.ParseFloat(numStr.String(), 64)
   433  		o := cmapFloat{fVal}
   434  		return o, err
   435  	}
   436  	intVal, err := strconv.ParseInt(numStr.String(), 10, 64)
   437  	o := cmapInt{intVal}
   438  	return o, err
   439  }
   440  
   441  // An operand is a text command represented by a word.
   442  func (p *cMapParser) parseOperand() (cmapOperand, error) {
   443  	op := cmapOperand{}
   444  
   445  	buf := bytes.Buffer{}
   446  	for {
   447  		bb, err := p.reader.Peek(1)
   448  		if err != nil {
   449  			if err == io.EOF {
   450  				break
   451  			}
   452  			return op, err
   453  		}
   454  		if core.IsDelimiter(bb[0]) {
   455  			break
   456  		}
   457  		if core.IsWhiteSpace(bb[0]) {
   458  			break
   459  		}
   460  
   461  		b, _ := p.reader.ReadByte()
   462  		buf.WriteByte(b)
   463  	}
   464  
   465  	if buf.Len() == 0 {
   466  		return op, fmt.Errorf("Invalid operand (empty)")
   467  	}
   468  
   469  	op.Operand = buf.String()
   470  
   471  	return op, nil
   472  }