github.com/andybalholm/giopdf@v0.0.0-20220317170119-aad9a095ad48/pdf/lex.go (about)

     1  // Copyright 2014 The Go Authors.  All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Reading of PDF tokens and objects from a raw byte stream.
     6  
     7  package pdf
     8  
     9  import (
    10  	"fmt"
    11  	"io"
    12  	"strconv"
    13  )
    14  
// A token is a PDF token in the input stream, one of the following Go types:
//
//	bool, a PDF boolean
//	int64, a PDF integer
//	float64, a PDF real
//	string, a PDF string (literal or hexadecimal), already decoded
//	keyword, a PDF keyword
//	name, a PDF name without the leading slash
//
// readToken additionally returns io.EOF as a token once the input is
// exhausted and the buffer's allowEOF flag is set.
type token interface{}
    25  
// A name is a PDF name, without the leading slash.
// readName resolves #xx hexadecimal escapes before building the value.
type name string
    28  
// A keyword is a PDF keyword.
// Delimiter tokens used in higher-level syntax,
// such as "<<", ">>", "[", "]", "{", "}", are also treated as keywords.
// readKeyword yields a keyword for any run of regular characters that is
// not "true", "false", an integer, or a real.
type keyword string
    33  
// A buffer holds buffered input bytes from the PDF file, together with the
// lexer state used to turn those bytes into tokens and objects.
type buffer struct {
	r           io.Reader // source of data
	buf         []byte    // buffered data
	pos         int       // read index in buf
	offset      int64     // offset at end of buf; aka offset of next read
	tmp         []byte    // scratch space for accumulating token
	unread      []token   // stack of read but then unread tokens; the last one pushed is returned first
	allowEOF    bool      // if set, reload records io.EOF in eof instead of panicking
	allowObjptr bool      // if set, readObject recognizes "id gen R" and "id gen obj"
	allowStream bool      // if set, readDict looks for a "stream" keyword after the dictionary
	eof         bool      // set by reload when r reports io.EOF and allowEOF is set
	key         []byte    // if non-nil, readObject passes strings through decryptString
	useAES      bool      // forwarded to decryptString
	objptr      objptr    // indirect object being read; forwarded to decryptString and recorded in streams
}
    50  
    51  // newBuffer returns a new buffer reading from r at the given offset.
    52  func newBuffer(r io.Reader, offset int64) *buffer {
    53  	return &buffer{
    54  		r:           r,
    55  		offset:      offset,
    56  		buf:         make([]byte, 0, 4096),
    57  		allowObjptr: true,
    58  		allowStream: true,
    59  	}
    60  }
    61  
    62  func (b *buffer) seek(offset int64) {
    63  	b.offset = offset
    64  	b.buf = b.buf[:0]
    65  	b.pos = 0
    66  	b.unread = b.unread[:0]
    67  }
    68  
    69  func (b *buffer) readByte() byte {
    70  	if b.pos >= len(b.buf) {
    71  		b.reload()
    72  		if b.pos >= len(b.buf) {
    73  			return '\n'
    74  		}
    75  	}
    76  	c := b.buf[b.pos]
    77  	b.pos++
    78  	return c
    79  }
    80  
    81  func (b *buffer) errorf(format string, args ...interface{}) {
    82  	panic(fmt.Errorf(format, args...))
    83  }
    84  
    85  func (b *buffer) reload() bool {
    86  	n := cap(b.buf) - int(b.offset%int64(cap(b.buf)))
    87  	n, err := b.r.Read(b.buf[:n])
    88  	if n == 0 && err != nil {
    89  		b.buf = b.buf[:0]
    90  		b.pos = 0
    91  		if b.allowEOF && err == io.EOF {
    92  			b.eof = true
    93  			return false
    94  		}
    95  		b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err)
    96  		return false
    97  	}
    98  	b.offset += int64(n)
    99  	b.buf = b.buf[:n]
   100  	b.pos = 0
   101  	return true
   102  }
   103  
   104  func (b *buffer) seekForward(offset int64) {
   105  	for b.offset < offset {
   106  		if !b.reload() {
   107  			return
   108  		}
   109  	}
   110  	b.pos = len(b.buf) - int(b.offset-offset)
   111  }
   112  
   113  func (b *buffer) readOffset() int64 {
   114  	return b.offset - int64(len(b.buf)) + int64(b.pos)
   115  }
   116  
   117  func (b *buffer) unreadByte() {
   118  	if b.pos > 0 {
   119  		b.pos--
   120  	}
   121  }
   122  
   123  func (b *buffer) unreadToken(t token) {
   124  	b.unread = append(b.unread, t)
   125  }
   126  
   127  func (b *buffer) readToken() token {
   128  	if n := len(b.unread); n > 0 {
   129  		t := b.unread[n-1]
   130  		b.unread = b.unread[:n-1]
   131  		return t
   132  	}
   133  
   134  	// Find first non-space, non-comment byte.
   135  	c := b.readByte()
   136  	for {
   137  		if isSpace(c) {
   138  			if b.eof {
   139  				return io.EOF
   140  			}
   141  			c = b.readByte()
   142  		} else if c == '%' {
   143  			for c != '\r' && c != '\n' {
   144  				c = b.readByte()
   145  			}
   146  		} else {
   147  			break
   148  		}
   149  	}
   150  
   151  	switch c {
   152  	case '<':
   153  		if b.readByte() == '<' {
   154  			return keyword("<<")
   155  		}
   156  		b.unreadByte()
   157  		return b.readHexString()
   158  
   159  	case '(':
   160  		return b.readLiteralString()
   161  
   162  	case '[', ']', '{', '}':
   163  		return keyword(string(c))
   164  
   165  	case '/':
   166  		return b.readName()
   167  
   168  	case '>':
   169  		if b.readByte() == '>' {
   170  			return keyword(">>")
   171  		}
   172  		b.unreadByte()
   173  		fallthrough
   174  
   175  	default:
   176  		if isDelim(c) {
   177  			b.errorf("unexpected delimiter %#q", rune(c))
   178  			return nil
   179  		}
   180  		b.unreadByte()
   181  		return b.readKeyword()
   182  	}
   183  }
   184  
   185  func (b *buffer) readHexString() token {
   186  	tmp := b.tmp[:0]
   187  	for {
   188  	Loop:
   189  		c := b.readByte()
   190  		if c == '>' {
   191  			break
   192  		}
   193  		if isSpace(c) {
   194  			goto Loop
   195  		}
   196  	Loop2:
   197  		c2 := b.readByte()
   198  		if isSpace(c2) {
   199  			goto Loop2
   200  		}
   201  		x := unhex(c)<<4 | unhex(c2)
   202  		if x < 0 {
   203  			b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:])
   204  			break
   205  		}
   206  		tmp = append(tmp, byte(x))
   207  	}
   208  	b.tmp = tmp
   209  	return string(tmp)
   210  }
   211  
   212  func unhex(b byte) int {
   213  	switch {
   214  	case '0' <= b && b <= '9':
   215  		return int(b) - '0'
   216  	case 'a' <= b && b <= 'f':
   217  		return int(b) - 'a' + 10
   218  	case 'A' <= b && b <= 'F':
   219  		return int(b) - 'A' + 10
   220  	}
   221  	return -1
   222  }
   223  
   224  func (b *buffer) readLiteralString() token {
   225  	tmp := b.tmp[:0]
   226  	depth := 1
   227  Loop:
   228  	for {
   229  		c := b.readByte()
   230  		switch c {
   231  		default:
   232  			tmp = append(tmp, c)
   233  		case '(':
   234  			depth++
   235  			tmp = append(tmp, c)
   236  		case ')':
   237  			if depth--; depth == 0 {
   238  				break Loop
   239  			}
   240  			tmp = append(tmp, c)
   241  		case '\\':
   242  			switch c = b.readByte(); c {
   243  			default:
   244  				b.errorf("invalid escape sequence \\%c", c)
   245  				tmp = append(tmp, '\\', c)
   246  			case 'n':
   247  				tmp = append(tmp, '\n')
   248  			case 'r':
   249  				tmp = append(tmp, '\r')
   250  			case 'b':
   251  				tmp = append(tmp, '\b')
   252  			case 't':
   253  				tmp = append(tmp, '\t')
   254  			case 'f':
   255  				tmp = append(tmp, '\f')
   256  			case '(', ')', '\\':
   257  				tmp = append(tmp, c)
   258  			case '\r':
   259  				if b.readByte() != '\n' {
   260  					b.unreadByte()
   261  				}
   262  				fallthrough
   263  			case '\n':
   264  				// no append
   265  			case '0', '1', '2', '3', '4', '5', '6', '7':
   266  				x := int(c - '0')
   267  				for i := 0; i < 2; i++ {
   268  					c = b.readByte()
   269  					if c < '0' || c > '7' {
   270  						b.unreadByte()
   271  						break
   272  					}
   273  					x = x*8 + int(c-'0')
   274  				}
   275  				if x > 255 {
   276  					b.errorf("invalid octal escape \\%03o", x)
   277  				}
   278  				tmp = append(tmp, byte(x))
   279  			}
   280  		}
   281  	}
   282  	b.tmp = tmp
   283  	return string(tmp)
   284  }
   285  
   286  func (b *buffer) readName() token {
   287  	tmp := b.tmp[:0]
   288  	for {
   289  		c := b.readByte()
   290  		if isDelim(c) || isSpace(c) {
   291  			b.unreadByte()
   292  			break
   293  		}
   294  		if c == '#' {
   295  			x := unhex(b.readByte())<<4 | unhex(b.readByte())
   296  			if x < 0 {
   297  				b.errorf("malformed name")
   298  			}
   299  			tmp = append(tmp, byte(x))
   300  			continue
   301  		}
   302  		tmp = append(tmp, c)
   303  	}
   304  	b.tmp = tmp
   305  	return name(string(tmp))
   306  }
   307  
   308  func (b *buffer) readKeyword() token {
   309  	tmp := b.tmp[:0]
   310  	for {
   311  		c := b.readByte()
   312  		if isDelim(c) || isSpace(c) {
   313  			b.unreadByte()
   314  			break
   315  		}
   316  		tmp = append(tmp, c)
   317  	}
   318  	b.tmp = tmp
   319  	s := string(tmp)
   320  	switch {
   321  	case s == "true":
   322  		return true
   323  	case s == "false":
   324  		return false
   325  	case isInteger(s):
   326  		x, err := strconv.ParseInt(s, 10, 64)
   327  		if err != nil {
   328  			b.errorf("invalid integer %s", s)
   329  		}
   330  		return x
   331  	case isReal(s):
   332  		x, err := strconv.ParseFloat(s, 64)
   333  		if err != nil {
   334  			b.errorf("invalid real %s", s)
   335  		}
   336  		return x
   337  	}
   338  	return keyword(string(tmp))
   339  }
   340  
   341  func isInteger(s string) bool {
   342  	if len(s) > 0 && (s[0] == '+' || s[0] == '-') {
   343  		s = s[1:]
   344  	}
   345  	if len(s) == 0 {
   346  		return false
   347  	}
   348  	for _, c := range s {
   349  		if c < '0' || '9' < c {
   350  			return false
   351  		}
   352  	}
   353  	return true
   354  }
   355  
   356  func isReal(s string) bool {
   357  	if len(s) > 0 && (s[0] == '+' || s[0] == '-') {
   358  		s = s[1:]
   359  	}
   360  	if len(s) == 0 {
   361  		return false
   362  	}
   363  	ndot := 0
   364  	for _, c := range s {
   365  		if c == '.' {
   366  			ndot++
   367  			continue
   368  		}
   369  		if c < '0' || '9' < c {
   370  			return false
   371  		}
   372  	}
   373  	return ndot == 1
   374  }
   375  
// An object is a PDF syntax object, one of the following Go types:
//
//	bool, a PDF boolean
//	int64, a PDF integer
//	float64, a PDF real
//	string, a PDF string literal
//	name, a PDF name without the leading slash
//	dict, a PDF dictionary
//	array, a PDF array
//	stream, a PDF stream
//	objptr, a PDF object reference
//	objdef, a PDF object definition
//
// An object may also be nil, to represent the PDF null.
// readObject returns any token it does not interpret further unchanged,
// so other token values can appear as objects too.
type object interface{}
   391  
// A dict is a PDF dictionary: a map from names (without the leading slash)
// to objects.
type dict map[name]object
   393  
// An array is a PDF array: an ordered list of objects.
type array []object
   395  
// A stream is a PDF stream as produced by readDict: the stream dictionary
// plus the position at which its data begins. The data itself is not read.
type stream struct {
	hdr    dict   // dictionary that preceded the "stream" keyword
	ptr    objptr // value of buffer.objptr when the stream was read
	offset int64  // read offset just past the line ending that follows "stream"
}
   401  
// An objptr identifies an indirect object. readObject builds one from the
// two integers of an "id gen R" reference or an "id gen obj" definition.
type objptr struct {
	id  uint32 // first integer of the pair
	gen uint16 // second integer of the pair
}
   406  
// An objdef is an indirect object definition, "id gen obj ... endobj",
// as returned by readObject.
type objdef struct {
	ptr objptr // identity given before the "obj" keyword
	obj object // the object that was defined
}
   411  
   412  func (b *buffer) readObject() object {
   413  	tok := b.readToken()
   414  	if kw, ok := tok.(keyword); ok {
   415  		switch kw {
   416  		case "null":
   417  			return nil
   418  		case "<<":
   419  			return b.readDict()
   420  		case "[":
   421  			return b.readArray()
   422  		}
   423  		b.errorf("unexpected keyword %q parsing object", kw)
   424  		return nil
   425  	}
   426  
   427  	if str, ok := tok.(string); ok && b.key != nil && b.objptr.id != 0 {
   428  		tok = decryptString(b.key, b.useAES, b.objptr, str)
   429  	}
   430  
   431  	if !b.allowObjptr {
   432  		return tok
   433  	}
   434  
   435  	if t1, ok := tok.(int64); ok && int64(uint32(t1)) == t1 {
   436  		tok2 := b.readToken()
   437  		if t2, ok := tok2.(int64); ok && int64(uint16(t2)) == t2 {
   438  			tok3 := b.readToken()
   439  			switch tok3 {
   440  			case keyword("R"):
   441  				return objptr{uint32(t1), uint16(t2)}
   442  			case keyword("obj"):
   443  				old := b.objptr
   444  				b.objptr = objptr{uint32(t1), uint16(t2)}
   445  				obj := b.readObject()
   446  				if _, ok := obj.(stream); !ok {
   447  					tok4 := b.readToken()
   448  					if tok4 != keyword("endobj") {
   449  						b.errorf("missing endobj after indirect object definition")
   450  						b.unreadToken(tok4)
   451  					}
   452  				}
   453  				b.objptr = old
   454  				return objdef{objptr{uint32(t1), uint16(t2)}, obj}
   455  			}
   456  			b.unreadToken(tok3)
   457  		}
   458  		b.unreadToken(tok2)
   459  	}
   460  	return tok
   461  }
   462  
   463  func (b *buffer) readArray() object {
   464  	var x array
   465  	for {
   466  		tok := b.readToken()
   467  		if tok == nil || tok == keyword("]") {
   468  			break
   469  		}
   470  		b.unreadToken(tok)
   471  		x = append(x, b.readObject())
   472  	}
   473  	return x
   474  }
   475  
   476  func (b *buffer) readDict() object {
   477  	x := make(dict)
   478  	for {
   479  		tok := b.readToken()
   480  		if tok == nil || tok == keyword(">>") {
   481  			break
   482  		}
   483  		n, ok := tok.(name)
   484  		if !ok {
   485  			b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok)
   486  			continue
   487  		}
   488  		x[n] = b.readObject()
   489  	}
   490  
   491  	if !b.allowStream {
   492  		return x
   493  	}
   494  
   495  	tok := b.readToken()
   496  	if tok != keyword("stream") {
   497  		b.unreadToken(tok)
   498  		return x
   499  	}
   500  
   501  	switch b.readByte() {
   502  	case '\r':
   503  		if b.readByte() != '\n' {
   504  			b.unreadByte()
   505  		}
   506  	case '\n':
   507  		// ok
   508  	default:
   509  		b.errorf("stream keyword not followed by newline")
   510  	}
   511  
   512  	return stream{x, b.objptr, b.readOffset()}
   513  }
   514  
   515  func isSpace(b byte) bool {
   516  	switch b {
   517  	case '\x00', '\t', '\n', '\f', '\r', ' ':
   518  		return true
   519  	}
   520  	return false
   521  }
   522  
   523  func isDelim(b byte) bool {
   524  	switch b {
   525  	case '<', '>', '(', ')', '[', ']', '{', '}', '/', '%':
   526  		return true
   527  	}
   528  	return false
   529  }