github.com/andybalholm/giopdf@v0.0.0-20220317170119-aad9a095ad48/pdf/read.go (about)

     1  // Copyright 2014 The Go Authors.  All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package pdf implements reading of PDF files.
     6  //
     7  // Overview
     8  //
     9  // PDF is Adobe's Portable Document Format, ubiquitous on the internet.
    10  // A PDF document is a complex data format built on a fairly simple structure.
    11  // This package exposes the simple structure along with some wrappers to
    12  // extract basic information. If more complex information is needed, it is
    13  // possible to extract that information by interpreting the structure exposed
    14  // by this package.
    15  //
    16  // Specifically, a PDF is a data structure built from Values, each of which has
    17  // one of the following Kinds:
    18  //
    19  //	Null, for the null object.
    20  //	Integer, for an integer.
    21  //	Real, for a floating-point number.
    22  //	Bool, for a boolean value.
    23  //	Name, for a name constant (as in /Helvetica).
    24  //	String, for a string constant.
    25  //	Dict, for a dictionary of name-value pairs.
    26  //	Array, for an array of values.
    27  //	Stream, for an opaque data stream and associated header dictionary.
    28  //
    29  // The accessors on Value—Int64, Float64, Bool, Name, and so on—return
    30  // a view of the data as the given type. When there is no appropriate view,
    31  // the accessor returns a zero result. For example, the Name accessor returns
    32  // the empty string if called on a Value v for which v.Kind() != Name.
    33  // Returning zero values this way, especially from the Dict and Array accessors,
    34  // which themselves return Values, makes it possible to traverse a PDF quickly
    35  // without writing any error checking. On the other hand, it means that mistakes
    36  // can go unreported.
    37  //
    38  // The basic structure of the PDF file is exposed as the graph of Values.
    39  //
    40  // Most richer data structures in a PDF file are dictionaries with specific interpretations
    41  // of the name-value pairs. The Font and Page wrappers make the interpretation
    42  // of a specific Value as the corresponding type easier. They are only helpers, though:
    43  // they are implemented only in terms of the Value API and could be moved outside
    44  // the package. Equally important, traversal of other PDF data structures can be implemented
    45  // in other packages as needed.
    46  //
    47  package pdf // import "rsc.io/pdf"
    48  
    49  // BUG(rsc): The package is incomplete, although it has been used successfully on some
    50  // large real-world PDF files.
    51  
    52  // BUG(rsc): There is no support for closing open PDF files. If you drop all references to a Reader,
    53  // the underlying reader will eventually be garbage collected.
    54  
    55  // BUG(rsc): The library makes no attempt at efficiency. A value cache maintained in the Reader
    56  // would probably help significantly.
    57  
    58  // BUG(rsc): The support for reading encrypted files is weak.
    59  
    60  // BUG(rsc): The Value API does not support error reporting. The intent is to allow users to
    61  // set an error reporting callback in Reader, but that code has not been implemented.
    62  
    63  import (
    64  	"bytes"
    65  	"compress/zlib"
    66  	"crypto/aes"
    67  	"crypto/cipher"
    68  	"crypto/md5"
    69  	"crypto/rc4"
    70  	"fmt"
    71  	"io"
    72  	"io/ioutil"
    73  	"os"
    74  	"sort"
    75  	"strconv"
    76  
    77  	"golang.org/x/image/ccitt"
    78  )
    79  
// A Reader is a single PDF file open for reading.
type Reader struct {
	f          io.ReaderAt // underlying file data, as handed to NewReaderEncrypted
	end        int64       // total size of the file, as handed to NewReaderEncrypted
	xref       []xref      // cross-reference table, indexed by object id
	trailer    dict        // trailer dictionary, or the header of the cross-reference stream
	trailerptr objptr      // pointer of the cross-reference stream object; zero for a classic xref table
	key        []byte      // stream decryption key; nil means streams are read without decryption
	useAES     bool        // passed along with key to the decryption helpers
}
    90  
// An xref is one entry of the cross-reference table. It records where the
// object identified by ptr is stored.
type xref struct {
	ptr      objptr // id and generation of the object described by this entry
	inStream bool   // true if the object lives inside an object stream
	stream   objptr // when inStream is set, the object stream that holds the object
	offset   int64  // when inStream is not set, file offset of the object definition (0 means no object)
}
    97  
    98  func (r *Reader) errorf(format string, args ...interface{}) {
    99  	panic(fmt.Errorf(format, args...))
   100  }
   101  
   102  // Open opens a file for reading.
   103  func Open(file string) (*Reader, error) {
   104  	// TODO: Deal with closing file.
   105  	f, err := os.Open(file)
   106  	if err != nil {
   107  		return nil, err
   108  	}
   109  	fi, err := f.Stat()
   110  	if err != nil {
   111  		f.Close()
   112  		return nil, err
   113  	}
   114  	return NewReader(f, fi.Size())
   115  }
   116  
   117  // NewReader opens a file for reading, using the data in f with the given total size.
   118  func NewReader(f io.ReaderAt, size int64) (*Reader, error) {
   119  	return NewReaderEncrypted(f, size, nil)
   120  }
   121  
   122  // NewReaderEncrypted opens a file for reading, using the data in f with the given total size.
   123  // If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords
   124  // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt
   125  // the file and returns an error.
   126  func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) {
   127  	buf := make([]byte, 10)
   128  	f.ReadAt(buf, 0)
   129  	if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' && buf[8] != ' ' {
   130  		return nil, fmt.Errorf("not a PDF file: invalid header")
   131  	}
   132  	end := size
   133  	const endChunk = 100
   134  	buf = make([]byte, endChunk)
   135  	f.ReadAt(buf, end-endChunk)
   136  	for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' {
   137  		buf = buf[:len(buf)-1]
   138  	}
   139  	buf = bytes.TrimRight(buf, "\r\n\t ")
   140  	if !bytes.HasSuffix(buf, []byte("%%EOF")) {
   141  		return nil, fmt.Errorf("not a PDF file: missing %%%%EOF")
   142  	}
   143  	i := findLastLine(buf, "startxref")
   144  	if i < 0 {
   145  		return nil, fmt.Errorf("malformed PDF file: missing final startxref")
   146  	}
   147  
   148  	r := &Reader{
   149  		f:   f,
   150  		end: end,
   151  	}
   152  	pos := end - endChunk + int64(i)
   153  	b := newBuffer(io.NewSectionReader(f, pos, end-pos), pos)
   154  	if b.readToken() != keyword("startxref") {
   155  		return nil, fmt.Errorf("malformed PDF file: missing startxref")
   156  	}
   157  	startxref, ok := b.readToken().(int64)
   158  	if !ok {
   159  		return nil, fmt.Errorf("malformed PDF file: startxref not followed by integer")
   160  	}
   161  	b = newBuffer(io.NewSectionReader(r.f, startxref, r.end-startxref), startxref)
   162  	xref, trailerptr, trailer, err := readXref(r, b)
   163  	if err != nil {
   164  		return nil, err
   165  	}
   166  	r.xref = xref
   167  	r.trailer = trailer
   168  	r.trailerptr = trailerptr
   169  	if trailer["Encrypt"] == nil {
   170  		return r, nil
   171  	}
   172  	err = r.initEncrypt("")
   173  	if err == nil {
   174  		return r, nil
   175  	}
   176  	if pw == nil || err != ErrInvalidPassword {
   177  		return nil, err
   178  	}
   179  	for {
   180  		next := pw()
   181  		if next == "" {
   182  			break
   183  		}
   184  		if r.initEncrypt(next) == nil {
   185  			return r, nil
   186  		}
   187  	}
   188  	return nil, err
   189  }
   190  
   191  // Trailer returns the file's Trailer value.
   192  func (r *Reader) Trailer() Value {
   193  	return Value{r, r.trailerptr, r.trailer}
   194  }
   195  
   196  func readXref(r *Reader, b *buffer) ([]xref, objptr, dict, error) {
   197  	tok := b.readToken()
   198  	if tok == keyword("xref") {
   199  		return readXrefTable(r, b)
   200  	}
   201  	if _, ok := tok.(int64); ok {
   202  		b.unreadToken(tok)
   203  		return readXrefStream(r, b)
   204  	}
   205  	return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", tok)
   206  }
   207  
   208  func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) {
   209  	obj1 := b.readObject()
   210  	obj, ok := obj1.(objdef)
   211  	if !ok {
   212  		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj1))
   213  	}
   214  	strmptr := obj.ptr
   215  	strm, ok := obj.obj.(stream)
   216  	if !ok {
   217  		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj))
   218  	}
   219  	if strm.hdr["Type"] != name("XRef") {
   220  		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream does not have type XRef")
   221  	}
   222  	size, ok := strm.hdr["Size"].(int64)
   223  	if !ok {
   224  		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream missing Size")
   225  	}
   226  	table := make([]xref, size)
   227  
   228  	table, err := readXrefStreamData(r, strm, table, size)
   229  	if err != nil {
   230  		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err)
   231  	}
   232  
   233  	for prevoff := strm.hdr["Prev"]; prevoff != nil; {
   234  		off, ok := prevoff.(int64)
   235  		if !ok {
   236  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff)
   237  		}
   238  		b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off)
   239  		obj1 := b.readObject()
   240  		obj, ok := obj1.(objdef)
   241  		if !ok {
   242  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj1))
   243  		}
   244  		prevstrm, ok := obj.obj.(stream)
   245  		if !ok {
   246  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj))
   247  		}
   248  		prevoff = prevstrm.hdr["Prev"]
   249  		prev := Value{r, objptr{}, prevstrm}
   250  		if prev.Kind() != Stream {
   251  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream is not stream: %v", prev)
   252  		}
   253  		if prev.Key("Type").Name() != "XRef" {
   254  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream does not have type XRef")
   255  		}
   256  		psize := prev.Key("Size").Int64()
   257  		if psize > size {
   258  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream larger than last stream")
   259  		}
   260  		if table, err = readXrefStreamData(r, prev.data.(stream), table, psize); err != nil {
   261  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: reading xref prev stream: %v", err)
   262  		}
   263  	}
   264  
   265  	return table, strmptr, strm.hdr, nil
   266  }
   267  
   268  func readXrefStreamData(r *Reader, strm stream, table []xref, size int64) ([]xref, error) {
   269  	index, _ := strm.hdr["Index"].(array)
   270  	if index == nil {
   271  		index = array{int64(0), size}
   272  	}
   273  	if len(index)%2 != 0 {
   274  		return nil, fmt.Errorf("invalid Index array %v", objfmt(index))
   275  	}
   276  	ww, ok := strm.hdr["W"].(array)
   277  	if !ok {
   278  		return nil, fmt.Errorf("xref stream missing W array")
   279  	}
   280  
   281  	var w []int
   282  	for _, x := range ww {
   283  		i, ok := x.(int64)
   284  		if !ok || int64(int(i)) != i {
   285  			return nil, fmt.Errorf("invalid W array %v", objfmt(ww))
   286  		}
   287  		w = append(w, int(i))
   288  	}
   289  	if len(w) < 3 {
   290  		return nil, fmt.Errorf("invalid W array %v", objfmt(ww))
   291  	}
   292  
   293  	v := Value{r, objptr{}, strm}
   294  	wtotal := 0
   295  	for _, wid := range w {
   296  		wtotal += wid
   297  	}
   298  	buf := make([]byte, wtotal)
   299  	data := v.Reader()
   300  	for len(index) > 0 {
   301  		start, ok1 := index[0].(int64)
   302  		n, ok2 := index[1].(int64)
   303  		if !ok1 || !ok2 {
   304  			return nil, fmt.Errorf("malformed Index pair %v %v %T %T", objfmt(index[0]), objfmt(index[1]), index[0], index[1])
   305  		}
   306  		index = index[2:]
   307  		for i := 0; i < int(n); i++ {
   308  			_, err := io.ReadFull(data, buf)
   309  			if err != nil {
   310  				return nil, fmt.Errorf("error reading xref stream: %v", err)
   311  			}
   312  			v1 := decodeInt(buf[0:w[0]])
   313  			if w[0] == 0 {
   314  				v1 = 1
   315  			}
   316  			v2 := decodeInt(buf[w[0] : w[0]+w[1]])
   317  			v3 := decodeInt(buf[w[0]+w[1] : w[0]+w[1]+w[2]])
   318  			x := int(start) + i
   319  			for cap(table) <= x {
   320  				table = append(table[:cap(table)], xref{})
   321  			}
   322  			if table[x].ptr != (objptr{}) {
   323  				continue
   324  			}
   325  			switch v1 {
   326  			case 0:
   327  				table[x] = xref{ptr: objptr{0, 65535}}
   328  			case 1:
   329  				table[x] = xref{ptr: objptr{uint32(x), uint16(v3)}, offset: int64(v2)}
   330  			case 2:
   331  				table[x] = xref{ptr: objptr{uint32(x), 0}, inStream: true, stream: objptr{uint32(v2), 0}, offset: int64(v3)}
   332  			default:
   333  				fmt.Printf("invalid xref stream type %d: %x\n", v1, buf)
   334  			}
   335  		}
   336  	}
   337  	return table, nil
   338  }
   339  
   340  func decodeInt(b []byte) int {
   341  	x := 0
   342  	for _, c := range b {
   343  		x = x<<8 | int(c)
   344  	}
   345  	return x
   346  }
   347  
   348  func readXrefTable(r *Reader, b *buffer) ([]xref, objptr, dict, error) {
   349  	var table []xref
   350  
   351  	table, err := readXrefTableData(b, table)
   352  	if err != nil {
   353  		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err)
   354  	}
   355  
   356  	trailer, ok := b.readObject().(dict)
   357  	if !ok {
   358  		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref table not followed by trailer dictionary")
   359  	}
   360  
   361  	for prevoff := trailer["Prev"]; prevoff != nil; {
   362  		off, ok := prevoff.(int64)
   363  		if !ok {
   364  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff)
   365  		}
   366  		b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off)
   367  		tok := b.readToken()
   368  		if tok != keyword("xref") {
   369  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev does not point to xref")
   370  		}
   371  		table, err = readXrefTableData(b, table)
   372  		if err != nil {
   373  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err)
   374  		}
   375  
   376  		trailer, ok := b.readObject().(dict)
   377  		if !ok {
   378  			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev table not followed by trailer dictionary")
   379  		}
   380  		prevoff = trailer["Prev"]
   381  	}
   382  
   383  	size, ok := trailer[name("Size")].(int64)
   384  	if !ok {
   385  		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: trailer missing /Size entry")
   386  	}
   387  
   388  	if size < int64(len(table)) {
   389  		table = table[:size]
   390  	}
   391  
   392  	return table, objptr{}, trailer, nil
   393  }
   394  
   395  func readXrefTableData(b *buffer, table []xref) ([]xref, error) {
   396  	for {
   397  		tok := b.readToken()
   398  		if tok == keyword("trailer") {
   399  			break
   400  		}
   401  		start, ok1 := tok.(int64)
   402  		n, ok2 := b.readToken().(int64)
   403  		if !ok1 || !ok2 {
   404  			return nil, fmt.Errorf("malformed xref table")
   405  		}
   406  		for i := 0; i < int(n); i++ {
   407  			off, ok1 := b.readToken().(int64)
   408  			gen, ok2 := b.readToken().(int64)
   409  			alloc, ok3 := b.readToken().(keyword)
   410  			if !ok1 || !ok2 || !ok3 || alloc != keyword("f") && alloc != keyword("n") {
   411  				return nil, fmt.Errorf("malformed xref table")
   412  			}
   413  			x := int(start) + i
   414  			for cap(table) <= x {
   415  				table = append(table[:cap(table)], xref{})
   416  			}
   417  			if len(table) <= x {
   418  				table = table[:x+1]
   419  			}
   420  			if alloc == "n" && table[x].offset == 0 {
   421  				table[x] = xref{ptr: objptr{uint32(x), uint16(gen)}, offset: int64(off)}
   422  			}
   423  		}
   424  	}
   425  	return table, nil
   426  }
   427  
   428  func findLastLine(buf []byte, s string) int {
   429  	bs := []byte(s)
   430  	max := len(buf)
   431  	for {
   432  		i := bytes.LastIndex(buf[:max], bs)
   433  		if i <= 0 || i+len(bs) >= len(buf) {
   434  			return -1
   435  		}
   436  		if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') {
   437  			return i
   438  		}
   439  		max = i
   440  	}
   441  }
   442  
// A Value is a single PDF value, such as an integer, dictionary, or array.
// The zero Value is a PDF null (Kind() == Null, IsNull() == true).
type Value struct {
	r    *Reader     // reader used to resolve references reached through this value
	ptr  objptr      // pointer of the indirect object this value was loaded from or sits in; zero if none
	data interface{} // underlying data: nil, bool, int64, float64, string, name, dict, array or stream
}
   450  
   451  // IsNull reports whether the value is a null. It is equivalent to Kind() == Null.
   452  func (v Value) IsNull() bool {
   453  	return v.data == nil
   454  }
   455  
// A ValueKind specifies the kind of data underlying a Value.
type ValueKind int

// The PDF value kinds.
const (
	// Null is the kind of the null object. Kind also reports it for any
	// data it does not recognize.
	Null ValueKind = iota
	// Bool is the kind of boolean values.
	Bool
	// Integer is the kind of integer values (stored as int64).
	Integer
	// Real is the kind of floating-point values (stored as float64).
	Real
	// String is the kind of string values.
	String
	// Name is the kind of name constants.
	Name
	// Dict is the kind of dictionaries.
	Dict
	// Array is the kind of arrays.
	Array
	// Stream is the kind of streams.
	Stream
)
   471  
   472  // Kind reports the kind of value underlying v.
   473  func (v Value) Kind() ValueKind {
   474  	switch v.data.(type) {
   475  	default:
   476  		return Null
   477  	case bool:
   478  		return Bool
   479  	case int64:
   480  		return Integer
   481  	case float64:
   482  		return Real
   483  	case string:
   484  		return String
   485  	case name:
   486  		return Name
   487  	case dict:
   488  		return Dict
   489  	case array:
   490  		return Array
   491  	case stream:
   492  		return Stream
   493  	}
   494  }
   495  
   496  // String returns a textual representation of the value v.
   497  // Note that String is not the accessor for values with Kind() == String.
   498  // To access such values, see RawString, Text, and TextFromUTF16.
   499  func (v Value) String() string {
   500  	return objfmt(v.data)
   501  }
   502  
   503  func objfmt(x interface{}) string {
   504  	switch x := x.(type) {
   505  	default:
   506  		return fmt.Sprint(x)
   507  	case string:
   508  		if isPDFDocEncoded(x) {
   509  			return strconv.Quote(pdfDocDecode(x))
   510  		}
   511  		if isUTF16(x) {
   512  			return strconv.Quote(utf16Decode(x[2:]))
   513  		}
   514  		return strconv.Quote(x)
   515  	case name:
   516  		return "/" + string(x)
   517  	case dict:
   518  		var keys []string
   519  		for k := range x {
   520  			keys = append(keys, string(k))
   521  		}
   522  		sort.Strings(keys)
   523  		var buf bytes.Buffer
   524  		buf.WriteString("<<")
   525  		for i, k := range keys {
   526  			elem := x[name(k)]
   527  			if i > 0 {
   528  				buf.WriteString(" ")
   529  			}
   530  			buf.WriteString("/")
   531  			buf.WriteString(k)
   532  			buf.WriteString(" ")
   533  			buf.WriteString(objfmt(elem))
   534  		}
   535  		buf.WriteString(">>")
   536  		return buf.String()
   537  
   538  	case array:
   539  		var buf bytes.Buffer
   540  		buf.WriteString("[")
   541  		for i, elem := range x {
   542  			if i > 0 {
   543  				buf.WriteString(" ")
   544  			}
   545  			buf.WriteString(objfmt(elem))
   546  		}
   547  		buf.WriteString("]")
   548  		return buf.String()
   549  
   550  	case stream:
   551  		return fmt.Sprintf("%v@%d", objfmt(x.hdr), x.offset)
   552  
   553  	case objptr:
   554  		return fmt.Sprintf("%d %d R", x.id, x.gen)
   555  
   556  	case objdef:
   557  		return fmt.Sprintf("{%d %d obj}%v", x.ptr.id, x.ptr.gen, objfmt(x.obj))
   558  	}
   559  }
   560  
   561  // Bool returns v's boolean value.
   562  // If v.Kind() != Bool, Bool returns false.
   563  func (v Value) Bool() bool {
   564  	x, ok := v.data.(bool)
   565  	if !ok {
   566  		return false
   567  	}
   568  	return x
   569  }
   570  
   571  // Int64 returns v's int64 value.
   572  // If v.Kind() != Int64, Int64 returns 0.
   573  func (v Value) Int64() int64 {
   574  	x, ok := v.data.(int64)
   575  	if !ok {
   576  		return 0
   577  	}
   578  	return x
   579  }
   580  
   581  // Int returns v's int value, converting from int64.
   582  // If v.Kind() != Int64, Int returns 0.
   583  func (v Value) Int() int {
   584  	x, ok := v.data.(int64)
   585  	if !ok {
   586  		return 0
   587  	}
   588  	return int(x)
   589  }
   590  
   591  // Float64 returns v's float64 value, converting from integer if necessary.
   592  // If v.Kind() != Float64 and v.Kind() != Int64, Float64 returns 0.
   593  func (v Value) Float64() float64 {
   594  	x, ok := v.data.(float64)
   595  	if !ok {
   596  		x, ok := v.data.(int64)
   597  		if ok {
   598  			return float64(x)
   599  		}
   600  		return 0
   601  	}
   602  	return x
   603  }
   604  
   605  // Float32 returns v's float32 value, converting from integer or float64.
   606  // If v.Kind() != Float64 and v.Kind() != Int64, Float32 returns 0.
   607  func (v Value) Float32() float32 {
   608  	x, ok := v.data.(float64)
   609  	if !ok {
   610  		x, ok := v.data.(int64)
   611  		if ok {
   612  			return float32(x)
   613  		}
   614  		return 0
   615  	}
   616  	return float32(x)
   617  }
   618  
   619  // RawString returns v's string value.
   620  // If v.Kind() != String, RawString returns the empty string.
   621  func (v Value) RawString() string {
   622  	x, ok := v.data.(string)
   623  	if !ok {
   624  		return ""
   625  	}
   626  	return x
   627  }
   628  
   629  // Text returns v's string value interpreted as a ``text string'' (defined in the PDF spec)
   630  // and converted to UTF-8.
   631  // If v.Kind() != String, Text returns the empty string.
   632  func (v Value) Text() string {
   633  	x, ok := v.data.(string)
   634  	if !ok {
   635  		return ""
   636  	}
   637  	if isPDFDocEncoded(x) {
   638  		return pdfDocDecode(x)
   639  	}
   640  	if isUTF16(x) {
   641  		return utf16Decode(x[2:])
   642  	}
   643  	return x
   644  }
   645  
   646  // TextFromUTF16 returns v's string value interpreted as big-endian UTF-16
   647  // and then converted to UTF-8.
   648  // If v.Kind() != String or if the data is not valid UTF-16, TextFromUTF16 returns
   649  // the empty string.
   650  func (v Value) TextFromUTF16() string {
   651  	x, ok := v.data.(string)
   652  	if !ok {
   653  		return ""
   654  	}
   655  	if len(x)%2 == 1 {
   656  		return ""
   657  	}
   658  	if x == "" {
   659  		return ""
   660  	}
   661  	return utf16Decode(x)
   662  }
   663  
   664  // Name returns v's name value.
   665  // If v.Kind() != Name, Name returns the empty string.
   666  // The returned name does not include the leading slash:
   667  // if v corresponds to the name written using the syntax /Helvetica,
   668  // Name() == "Helvetica".
   669  func (v Value) Name() string {
   670  	x, ok := v.data.(name)
   671  	if !ok {
   672  		return ""
   673  	}
   674  	return string(x)
   675  }
   676  
   677  // Key returns the value associated with the given name key in the dictionary v.
   678  // Like the result of the Name method, the key should not include a leading slash.
   679  // If v is a stream, Key applies to the stream's header dictionary.
   680  // If v.Kind() != Dict and v.Kind() != Stream, Key returns a null Value.
   681  func (v Value) Key(key string) Value {
   682  	x, ok := v.data.(dict)
   683  	if !ok {
   684  		strm, ok := v.data.(stream)
   685  		if !ok {
   686  			return Value{}
   687  		}
   688  		x = strm.hdr
   689  	}
   690  	return v.r.resolve(v.ptr, x[name(key)])
   691  }
   692  
   693  // Keys returns a sorted list of the keys in the dictionary v.
   694  // If v is a stream, Keys applies to the stream's header dictionary.
   695  // If v.Kind() != Dict and v.Kind() != Stream, Keys returns nil.
   696  func (v Value) Keys() []string {
   697  	x, ok := v.data.(dict)
   698  	if !ok {
   699  		strm, ok := v.data.(stream)
   700  		if !ok {
   701  			return nil
   702  		}
   703  		x = strm.hdr
   704  	}
   705  	keys := []string{} // not nil
   706  	for k := range x {
   707  		keys = append(keys, string(k))
   708  	}
   709  	sort.Strings(keys)
   710  	return keys
   711  }
   712  
   713  // Index returns the i'th element in the array v.
   714  // If v.Kind() != Array or if i is outside the array bounds,
   715  // Index returns a null Value.
   716  func (v Value) Index(i int) Value {
   717  	x, ok := v.data.(array)
   718  	if !ok || i < 0 || i >= len(x) {
   719  		return Value{}
   720  	}
   721  	return v.r.resolve(v.ptr, x[i])
   722  }
   723  
   724  // Len returns the length of the array v.
   725  // If v.Kind() != Array, Len returns 0.
   726  func (v Value) Len() int {
   727  	x, ok := v.data.(array)
   728  	if !ok {
   729  		return 0
   730  	}
   731  	return len(x)
   732  }
   733  
   734  func (r *Reader) resolve(parent objptr, x interface{}) Value {
   735  	if ptr, ok := x.(objptr); ok {
   736  		if ptr.id >= uint32(len(r.xref)) {
   737  			return Value{}
   738  		}
   739  		xref := r.xref[ptr.id]
   740  		if xref.ptr != ptr || !xref.inStream && xref.offset == 0 {
   741  			return Value{}
   742  		}
   743  		var obj object
   744  		if xref.inStream {
   745  			strm := r.resolve(parent, xref.stream)
   746  		Search:
   747  			for {
   748  				if strm.Kind() != Stream {
   749  					panic("not a stream")
   750  				}
   751  				if strm.Key("Type").Name() != "ObjStm" {
   752  					panic("not an object stream")
   753  				}
   754  				n := int(strm.Key("N").Int64())
   755  				first := strm.Key("First").Int64()
   756  				if first == 0 {
   757  					panic("missing First")
   758  				}
   759  				b := newBuffer(strm.Reader(), 0)
   760  				b.allowEOF = true
   761  				for i := 0; i < n; i++ {
   762  					id, _ := b.readToken().(int64)
   763  					off, _ := b.readToken().(int64)
   764  					if uint32(id) == ptr.id {
   765  						b.seekForward(first + off)
   766  						x = b.readObject()
   767  						break Search
   768  					}
   769  				}
   770  				ext := strm.Key("Extends")
   771  				if ext.Kind() != Stream {
   772  					panic("cannot find object in stream")
   773  				}
   774  				strm = ext
   775  			}
   776  		} else {
   777  			b := newBuffer(io.NewSectionReader(r.f, xref.offset, r.end-xref.offset), xref.offset)
   778  			b.key = r.key
   779  			b.useAES = r.useAES
   780  			obj = b.readObject()
   781  			def, ok := obj.(objdef)
   782  			if !ok {
   783  				panic(fmt.Errorf("loading %v: found %T instead of objdef", ptr, obj))
   784  				//return Value{}
   785  			}
   786  			if def.ptr != ptr {
   787  				panic(fmt.Errorf("loading %v: found %v", ptr, def.ptr))
   788  			}
   789  			x = def.obj
   790  		}
   791  		parent = ptr
   792  	}
   793  
   794  	switch x := x.(type) {
   795  	case nil, bool, int64, float64, name, dict, array, stream:
   796  		return Value{r, parent, x}
   797  	case string:
   798  		return Value{r, parent, x}
   799  	default:
   800  		panic(fmt.Errorf("unexpected value type %T in resolve", x))
   801  	}
   802  }
   803  
   804  type errorReadCloser struct {
   805  	err error
   806  }
   807  
   808  func (e *errorReadCloser) Read([]byte) (int, error) {
   809  	return 0, e.err
   810  }
   811  
   812  func (e *errorReadCloser) Close() error {
   813  	return e.err
   814  }
   815  
   816  // Reader returns the data contained in the stream v.
   817  // If v.Kind() != Stream, Reader returns a ReadCloser that
   818  // responds to all reads with a ``stream not present'' error.
   819  func (v Value) Reader() io.ReadCloser {
   820  	x, ok := v.data.(stream)
   821  	if !ok {
   822  		return &errorReadCloser{fmt.Errorf("stream not present")}
   823  	}
   824  	var rd io.Reader
   825  	rd = io.NewSectionReader(v.r.f, x.offset, v.Key("Length").Int64())
   826  	if v.r.key != nil {
   827  		rd = decryptStream(v.r.key, v.r.useAES, x.ptr, rd)
   828  	}
   829  	filter := v.Key("Filter")
   830  	param := v.Key("DecodeParms")
   831  	switch filter.Kind() {
   832  	default:
   833  		panic(fmt.Errorf("unsupported filter %v", filter))
   834  	case Null:
   835  		// ok
   836  	case Name:
   837  		rd = applyFilter(rd, filter.Name(), param)
   838  	case Array:
   839  		for i := 0; i < filter.Len(); i++ {
   840  			rd = applyFilter(rd, filter.Index(i).Name(), param.Index(i))
   841  		}
   842  	}
   843  
   844  	return ioutil.NopCloser(rd)
   845  }
   846  
   847  func applyFilter(rd io.Reader, name string, param Value) io.Reader {
   848  	switch name {
   849  	default:
   850  		panic("unknown filter " + name)
   851  	case "FlateDecode":
   852  		zr, err := zlib.NewReader(rd)
   853  		if err != nil {
   854  			panic(err)
   855  		}
   856  		pred := param.Key("Predictor")
   857  		if pred.Kind() == Null {
   858  			return zr
   859  		}
   860  		columns := param.Key("Columns").Int64()
   861  		switch pred.Int64() {
   862  		default:
   863  			fmt.Println("unknown predictor", pred)
   864  			panic("pred")
   865  		case 12:
   866  			return &pngUpReader{r: zr, hist: make([]byte, 1+columns), tmp: make([]byte, 1+columns)}
   867  		}
   868  
   869  	case "CCITTFaxDecode":
   870  		sf := ccitt.Group3
   871  		if param.Key("K").Int() < 0 {
   872  			sf = ccitt.Group4
   873  		}
   874  		width := 1728
   875  		if cols := param.Key("Columns"); !cols.IsNull() {
   876  			width = cols.Int()
   877  		}
   878  		height := ccitt.AutoDetectHeight
   879  		if rows := param.Key("Rows").Int(); rows != 0 {
   880  			height = rows
   881  		}
   882  		invert := param.Key("BlackIs1").Bool()
   883  		return ccitt.NewReader(rd, ccitt.MSB, sf, width, height, &ccitt.Options{Invert: invert})
   884  	}
   885  }
   886  
   887  type pngUpReader struct {
   888  	r    io.Reader
   889  	hist []byte
   890  	tmp  []byte
   891  	pend []byte
   892  }
   893  
   894  func (r *pngUpReader) Read(b []byte) (int, error) {
   895  	n := 0
   896  	for len(b) > 0 {
   897  		if len(r.pend) > 0 {
   898  			m := copy(b, r.pend)
   899  			n += m
   900  			b = b[m:]
   901  			r.pend = r.pend[m:]
   902  			continue
   903  		}
   904  		_, err := io.ReadFull(r.r, r.tmp)
   905  		if err != nil {
   906  			return n, err
   907  		}
   908  		if r.tmp[0] != 2 {
   909  			return n, fmt.Errorf("malformed PNG-Up encoding")
   910  		}
   911  		for i, b := range r.tmp {
   912  			r.hist[i] += b
   913  		}
   914  		r.pend = r.hist[1:]
   915  	}
   916  	return n, nil
   917  }
   918  
   919  // HasFilter returns whether v is a stream encoded with the specified filter.
   920  // (There may be other filters as well.)
   921  func (v Value) HasFilter(filterName string) bool {
   922  	if _, ok := v.data.(stream); !ok {
   923  		return false
   924  	}
   925  	filter := v.Key("Filter")
   926  	switch filter.Kind() {
   927  	case Name:
   928  		return filter.Name() == filterName
   929  	case Array:
   930  		for i := 0; i < filter.Len(); i++ {
   931  			if filter.Index(i).Name() == filterName {
   932  				return true
   933  			}
   934  		}
   935  	}
   936  
   937  	return false
   938  }
   939  
   940  // EncodedReader returns the data contained in the stream v.
   941  // It does not apply the specified filter, so the returned data will be in the
   942  // format that filter expects as input (assuming the stream actually has that
   943  // filter).
   944  // Before calling EncodedReader, you should check whether the stream has the
   945  // filter you are interested in, by calling HasFilter.
   946  func (v Value) EncodedReader(filterName string) io.Reader {
   947  	x, ok := v.data.(stream)
   948  	if !ok {
   949  		return &errorReadCloser{fmt.Errorf("stream not present")}
   950  	}
   951  	var rd io.Reader
   952  	rd = io.NewSectionReader(v.r.f, x.offset, v.Key("Length").Int64())
   953  	if v.r.key != nil {
   954  		rd = decryptStream(v.r.key, v.r.useAES, x.ptr, rd)
   955  	}
   956  	filter := v.Key("Filter")
   957  	param := v.Key("DecodeParms")
   958  	switch filter.Kind() {
   959  	default:
   960  		panic(fmt.Errorf("unsupported filter %v", filter))
   961  	case Null:
   962  		// ok
   963  	case Name:
   964  		if filter.Name() == filterName {
   965  			return rd
   966  		}
   967  		rd = applyFilter(rd, filter.Name(), param)
   968  	case Array:
   969  		for i := 0; i < filter.Len(); i++ {
   970  			if filter.Index(i).Name() == filterName {
   971  				return rd
   972  			}
   973  			rd = applyFilter(rd, filter.Index(i).Name(), param.Index(i))
   974  		}
   975  	}
   976  
   977  	return rd
   978  }
   979  
   980  var passwordPad = []byte{
   981  	0x28, 0xBF, 0x4E, 0x5E, 0x4E, 0x75, 0x8A, 0x41, 0x64, 0x00, 0x4E, 0x56, 0xFF, 0xFA, 0x01, 0x08,
   982  	0x2E, 0x2E, 0x00, 0xB6, 0xD0, 0x68, 0x3E, 0x80, 0x2F, 0x0C, 0xA9, 0xFE, 0x64, 0x53, 0x69, 0x7A,
   983  }
   984  
   985  func (r *Reader) initEncrypt(password string) error {
   986  	// See PDF 32000-1:2008, §7.6.
   987  	encrypt, _ := r.resolve(objptr{}, r.trailer["Encrypt"]).data.(dict)
   988  	if encrypt["Filter"] != name("Standard") {
   989  		return fmt.Errorf("unsupported PDF: encryption filter %v", objfmt(encrypt["Filter"]))
   990  	}
   991  	n, _ := encrypt["Length"].(int64)
   992  	if n == 0 {
   993  		n = 40
   994  	}
   995  	if n%8 != 0 || n > 128 || n < 40 {
   996  		return fmt.Errorf("malformed PDF: %d-bit encryption key", n)
   997  	}
   998  	V, _ := encrypt["V"].(int64)
   999  	if V != 1 && V != 2 && (V != 4 || !okayV4(encrypt)) {
  1000  		return fmt.Errorf("unsupported PDF: encryption version V=%d; %v", V, objfmt(encrypt))
  1001  	}
  1002  
  1003  	ids, ok := r.trailer["ID"].(array)
  1004  	if !ok || len(ids) < 1 {
  1005  		return fmt.Errorf("malformed PDF: missing ID in trailer")
  1006  	}
  1007  	idstr, ok := ids[0].(string)
  1008  	if !ok {
  1009  		return fmt.Errorf("malformed PDF: missing ID in trailer")
  1010  	}
  1011  	ID := []byte(idstr)
  1012  
  1013  	R, _ := encrypt["R"].(int64)
  1014  	if R < 2 {
  1015  		return fmt.Errorf("malformed PDF: encryption revision R=%d", R)
  1016  	}
  1017  	if R > 4 {
  1018  		return fmt.Errorf("unsupported PDF: encryption revision R=%d", R)
  1019  	}
  1020  	O, _ := encrypt["O"].(string)
  1021  	U, _ := encrypt["U"].(string)
  1022  	if len(O) != 32 || len(U) != 32 {
  1023  		return fmt.Errorf("malformed PDF: missing O= or U= encryption parameters")
  1024  	}
  1025  	p, _ := encrypt["P"].(int64)
  1026  	P := uint32(p)
  1027  
  1028  	// TODO: Password should be converted to Latin-1.
  1029  	pw := []byte(password)
  1030  	h := md5.New()
  1031  	if len(pw) >= 32 {
  1032  		h.Write(pw[:32])
  1033  	} else {
  1034  		h.Write(pw)
  1035  		h.Write(passwordPad[:32-len(pw)])
  1036  	}
  1037  	h.Write([]byte(O))
  1038  	h.Write([]byte{byte(P), byte(P >> 8), byte(P >> 16), byte(P >> 24)})
  1039  	h.Write([]byte(ID))
  1040  	key := h.Sum(nil)
  1041  
  1042  	if R >= 3 {
  1043  		for i := 0; i < 50; i++ {
  1044  			h.Reset()
  1045  			h.Write(key[:n/8])
  1046  			key = h.Sum(key[:0])
  1047  		}
  1048  		key = key[:n/8]
  1049  	} else {
  1050  		key = key[:40/8]
  1051  	}
  1052  
  1053  	c, err := rc4.NewCipher(key)
  1054  	if err != nil {
  1055  		return fmt.Errorf("malformed PDF: invalid RC4 key: %v", err)
  1056  	}
  1057  
  1058  	var u []byte
  1059  	if R == 2 {
  1060  		u = make([]byte, 32)
  1061  		copy(u, passwordPad)
  1062  		c.XORKeyStream(u, u)
  1063  	} else {
  1064  		h.Reset()
  1065  		h.Write(passwordPad)
  1066  		h.Write([]byte(ID))
  1067  		u = h.Sum(nil)
  1068  		c.XORKeyStream(u, u)
  1069  
  1070  		for i := 1; i <= 19; i++ {
  1071  			key1 := make([]byte, len(key))
  1072  			copy(key1, key)
  1073  			for j := range key1 {
  1074  				key1[j] ^= byte(i)
  1075  			}
  1076  			c, _ = rc4.NewCipher(key1)
  1077  			c.XORKeyStream(u, u)
  1078  		}
  1079  	}
  1080  
  1081  	if !bytes.HasPrefix([]byte(U), u) {
  1082  		return ErrInvalidPassword
  1083  	}
  1084  
  1085  	r.key = key
  1086  	r.useAES = V == 4
  1087  
  1088  	return nil
  1089  }
  1090  
// ErrInvalidPassword is the error returned by initEncrypt when the value
// derived from the supplied password does not match the U entry of the
// document's encryption dictionary.
var ErrInvalidPassword = fmt.Errorf("encrypted PDF: invalid password")
  1092  
  1093  func okayV4(encrypt dict) bool {
  1094  	cf, ok := encrypt["CF"].(dict)
  1095  	if !ok {
  1096  		return false
  1097  	}
  1098  	stmf, ok := encrypt["StmF"].(name)
  1099  	if !ok {
  1100  		return false
  1101  	}
  1102  	strf, ok := encrypt["StrF"].(name)
  1103  	if !ok {
  1104  		return false
  1105  	}
  1106  	if stmf != strf {
  1107  		return false
  1108  	}
  1109  	cfparam, ok := cf[stmf].(dict)
  1110  	if cfparam["AuthEvent"] != nil && cfparam["AuthEvent"] != name("DocOpen") {
  1111  		return false
  1112  	}
  1113  	if cfparam["Length"] != nil && cfparam["Length"] != int64(16) {
  1114  		return false
  1115  	}
  1116  	if cfparam["CFM"] != name("AESV2") {
  1117  		return false
  1118  	}
  1119  	return true
  1120  }
  1121  
  1122  func cryptKey(key []byte, useAES bool, ptr objptr) []byte {
  1123  	h := md5.New()
  1124  	h.Write(key)
  1125  	h.Write([]byte{byte(ptr.id), byte(ptr.id >> 8), byte(ptr.id >> 16), byte(ptr.gen), byte(ptr.gen >> 8)})
  1126  	if useAES {
  1127  		h.Write([]byte("sAlT"))
  1128  	}
  1129  	return h.Sum(nil)
  1130  }
  1131  
  1132  func decryptString(key []byte, useAES bool, ptr objptr, x string) string {
  1133  	key = cryptKey(key, useAES, ptr)
  1134  	if useAES {
  1135  		panic("AES not implemented")
  1136  	} else {
  1137  		c, _ := rc4.NewCipher(key)
  1138  		data := []byte(x)
  1139  		c.XORKeyStream(data, data)
  1140  		x = string(data)
  1141  	}
  1142  	return x
  1143  }
  1144  
  1145  func decryptStream(key []byte, useAES bool, ptr objptr, rd io.Reader) io.Reader {
  1146  	key = cryptKey(key, useAES, ptr)
  1147  	if useAES {
  1148  		cb, err := aes.NewCipher(key)
  1149  		if err != nil {
  1150  			panic("AES: " + err.Error())
  1151  		}
  1152  		iv := make([]byte, 16)
  1153  		io.ReadFull(rd, iv)
  1154  		cbc := cipher.NewCBCDecrypter(cb, iv)
  1155  		rd = &cbcReader{cbc: cbc, rd: rd, buf: make([]byte, 16)}
  1156  	} else {
  1157  		c, _ := rc4.NewCipher(key)
  1158  		rd = &cipher.StreamReader{S: c, R: rd}
  1159  	}
  1160  	return rd
  1161  }
  1162  
  1163  type cbcReader struct {
  1164  	cbc  cipher.BlockMode
  1165  	rd   io.Reader
  1166  	buf  []byte
  1167  	pend []byte
  1168  }
  1169  
  1170  func (r *cbcReader) Read(b []byte) (n int, err error) {
  1171  	if len(r.pend) == 0 {
  1172  		_, err = io.ReadFull(r.rd, r.buf)
  1173  		if err != nil {
  1174  			return 0, err
  1175  		}
  1176  		r.cbc.CryptBlocks(r.buf, r.buf)
  1177  		r.pend = r.buf
  1178  	}
  1179  	n = copy(b, r.pend)
  1180  	r.pend = r.pend[n:]
  1181  	return n, nil
  1182  }