// Source: github.com/andybalholm/giopdf@v0.0.0-20220317170119-aad9a095ad48/pdf/read.go

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package pdf implements reading of PDF files.
//
// Overview
//
// PDF is Adobe's Portable Document Format, ubiquitous on the internet.
// A PDF document is a complex data format built on a fairly simple structure.
// This package exposes the simple structure along with some wrappers to
// extract basic information. If more complex information is needed, it is
// possible to extract that information by interpreting the structure exposed
// by this package.
//
// Specifically, a PDF is a data structure built from Values, each of which has
// one of the following Kinds:
//
//	Null, for the null object.
//	Integer, for an integer.
//	Real, for a floating-point number.
//	Bool, for a boolean value.
//	Name, for a name constant (as in /Helvetica).
//	String, for a string constant.
//	Dict, for a dictionary of name-value pairs.
//	Array, for an array of values.
//	Stream, for an opaque data stream and associated header dictionary.
//
// The accessors on Value—Int64, Float64, Bool, Name, and so on—return
// a view of the data as the given type. When there is no appropriate view,
// the accessor returns a zero result. For example, the Name accessor returns
// the empty string if called on a Value v for which v.Kind() != Name.
// Returning zero values this way, especially from the Dict and Array accessors,
// which themselves return Values, makes it possible to traverse a PDF quickly
// without writing any error checking. On the other hand, it means that mistakes
// can go unreported.
//
// The basic structure of the PDF file is exposed as the graph of Values.
39 // 40 // Most richer data structures in a PDF file are dictionaries with specific interpretations 41 // of the name-value pairs. The Font and Page wrappers make the interpretation 42 // of a specific Value as the corresponding type easier. They are only helpers, though: 43 // they are implemented only in terms of the Value API and could be moved outside 44 // the package. Equally important, traversal of other PDF data structures can be implemented 45 // in other packages as needed. 46 // 47 package pdf // import "rsc.io/pdf" 48 49 // BUG(rsc): The package is incomplete, although it has been used successfully on some 50 // large real-world PDF files. 51 52 // BUG(rsc): There is no support for closing open PDF files. If you drop all references to a Reader, 53 // the underlying reader will eventually be garbage collected. 54 55 // BUG(rsc): The library makes no attempt at efficiency. A value cache maintained in the Reader 56 // would probably help significantly. 57 58 // BUG(rsc): The support for reading encrypted files is weak. 59 60 // BUG(rsc): The Value API does not support error reporting. The intent is to allow users to 61 // set an error reporting callback in Reader, but that code has not been implemented. 62 63 import ( 64 "bytes" 65 "compress/zlib" 66 "crypto/aes" 67 "crypto/cipher" 68 "crypto/md5" 69 "crypto/rc4" 70 "fmt" 71 "io" 72 "io/ioutil" 73 "os" 74 "sort" 75 "strconv" 76 77 "golang.org/x/image/ccitt" 78 ) 79 80 // A Reader is a single PDF file open for reading. 81 type Reader struct { 82 f io.ReaderAt 83 end int64 84 xref []xref 85 trailer dict 86 trailerptr objptr 87 key []byte 88 useAES bool 89 } 90 91 type xref struct { 92 ptr objptr 93 inStream bool 94 stream objptr 95 offset int64 96 } 97 98 func (r *Reader) errorf(format string, args ...interface{}) { 99 panic(fmt.Errorf(format, args...)) 100 } 101 102 // Open opens a file for reading. 103 func Open(file string) (*Reader, error) { 104 // TODO: Deal with closing file. 
105 f, err := os.Open(file) 106 if err != nil { 107 return nil, err 108 } 109 fi, err := f.Stat() 110 if err != nil { 111 f.Close() 112 return nil, err 113 } 114 return NewReader(f, fi.Size()) 115 } 116 117 // NewReader opens a file for reading, using the data in f with the given total size. 118 func NewReader(f io.ReaderAt, size int64) (*Reader, error) { 119 return NewReaderEncrypted(f, size, nil) 120 } 121 122 // NewReaderEncrypted opens a file for reading, using the data in f with the given total size. 123 // If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords 124 // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt 125 // the file and returns an error. 126 func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { 127 buf := make([]byte, 10) 128 f.ReadAt(buf, 0) 129 if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' && buf[8] != ' ' { 130 return nil, fmt.Errorf("not a PDF file: invalid header") 131 } 132 end := size 133 const endChunk = 100 134 buf = make([]byte, endChunk) 135 f.ReadAt(buf, end-endChunk) 136 for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' { 137 buf = buf[:len(buf)-1] 138 } 139 buf = bytes.TrimRight(buf, "\r\n\t ") 140 if !bytes.HasSuffix(buf, []byte("%%EOF")) { 141 return nil, fmt.Errorf("not a PDF file: missing %%%%EOF") 142 } 143 i := findLastLine(buf, "startxref") 144 if i < 0 { 145 return nil, fmt.Errorf("malformed PDF file: missing final startxref") 146 } 147 148 r := &Reader{ 149 f: f, 150 end: end, 151 } 152 pos := end - endChunk + int64(i) 153 b := newBuffer(io.NewSectionReader(f, pos, end-pos), pos) 154 if b.readToken() != keyword("startxref") { 155 return nil, fmt.Errorf("malformed PDF file: missing startxref") 156 } 157 startxref, ok := b.readToken().(int64) 158 if !ok { 159 return nil, fmt.Errorf("malformed PDF file: startxref not followed by integer") 160 } 
161 b = newBuffer(io.NewSectionReader(r.f, startxref, r.end-startxref), startxref) 162 xref, trailerptr, trailer, err := readXref(r, b) 163 if err != nil { 164 return nil, err 165 } 166 r.xref = xref 167 r.trailer = trailer 168 r.trailerptr = trailerptr 169 if trailer["Encrypt"] == nil { 170 return r, nil 171 } 172 err = r.initEncrypt("") 173 if err == nil { 174 return r, nil 175 } 176 if pw == nil || err != ErrInvalidPassword { 177 return nil, err 178 } 179 for { 180 next := pw() 181 if next == "" { 182 break 183 } 184 if r.initEncrypt(next) == nil { 185 return r, nil 186 } 187 } 188 return nil, err 189 } 190 191 // Trailer returns the file's Trailer value. 192 func (r *Reader) Trailer() Value { 193 return Value{r, r.trailerptr, r.trailer} 194 } 195 196 func readXref(r *Reader, b *buffer) ([]xref, objptr, dict, error) { 197 tok := b.readToken() 198 if tok == keyword("xref") { 199 return readXrefTable(r, b) 200 } 201 if _, ok := tok.(int64); ok { 202 b.unreadToken(tok) 203 return readXrefStream(r, b) 204 } 205 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", tok) 206 } 207 208 func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) { 209 obj1 := b.readObject() 210 obj, ok := obj1.(objdef) 211 if !ok { 212 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj1)) 213 } 214 strmptr := obj.ptr 215 strm, ok := obj.obj.(stream) 216 if !ok { 217 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj)) 218 } 219 if strm.hdr["Type"] != name("XRef") { 220 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream does not have type XRef") 221 } 222 size, ok := strm.hdr["Size"].(int64) 223 if !ok { 224 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream missing Size") 225 } 226 table := make([]xref, size) 227 228 table, err := readXrefStreamData(r, strm, table, size) 229 if err != nil { 230 return nil, 
objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) 231 } 232 233 for prevoff := strm.hdr["Prev"]; prevoff != nil; { 234 off, ok := prevoff.(int64) 235 if !ok { 236 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff) 237 } 238 b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off) 239 obj1 := b.readObject() 240 obj, ok := obj1.(objdef) 241 if !ok { 242 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj1)) 243 } 244 prevstrm, ok := obj.obj.(stream) 245 if !ok { 246 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj)) 247 } 248 prevoff = prevstrm.hdr["Prev"] 249 prev := Value{r, objptr{}, prevstrm} 250 if prev.Kind() != Stream { 251 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream is not stream: %v", prev) 252 } 253 if prev.Key("Type").Name() != "XRef" { 254 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream does not have type XRef") 255 } 256 psize := prev.Key("Size").Int64() 257 if psize > size { 258 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream larger than last stream") 259 } 260 if table, err = readXrefStreamData(r, prev.data.(stream), table, psize); err != nil { 261 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: reading xref prev stream: %v", err) 262 } 263 } 264 265 return table, strmptr, strm.hdr, nil 266 } 267 268 func readXrefStreamData(r *Reader, strm stream, table []xref, size int64) ([]xref, error) { 269 index, _ := strm.hdr["Index"].(array) 270 if index == nil { 271 index = array{int64(0), size} 272 } 273 if len(index)%2 != 0 { 274 return nil, fmt.Errorf("invalid Index array %v", objfmt(index)) 275 } 276 ww, ok := strm.hdr["W"].(array) 277 if !ok { 278 return nil, fmt.Errorf("xref stream missing W array") 279 } 280 281 var w []int 282 for _, x := range ww { 283 i, ok := x.(int64) 284 if !ok || int64(int(i)) != i { 285 return nil, 
fmt.Errorf("invalid W array %v", objfmt(ww)) 286 } 287 w = append(w, int(i)) 288 } 289 if len(w) < 3 { 290 return nil, fmt.Errorf("invalid W array %v", objfmt(ww)) 291 } 292 293 v := Value{r, objptr{}, strm} 294 wtotal := 0 295 for _, wid := range w { 296 wtotal += wid 297 } 298 buf := make([]byte, wtotal) 299 data := v.Reader() 300 for len(index) > 0 { 301 start, ok1 := index[0].(int64) 302 n, ok2 := index[1].(int64) 303 if !ok1 || !ok2 { 304 return nil, fmt.Errorf("malformed Index pair %v %v %T %T", objfmt(index[0]), objfmt(index[1]), index[0], index[1]) 305 } 306 index = index[2:] 307 for i := 0; i < int(n); i++ { 308 _, err := io.ReadFull(data, buf) 309 if err != nil { 310 return nil, fmt.Errorf("error reading xref stream: %v", err) 311 } 312 v1 := decodeInt(buf[0:w[0]]) 313 if w[0] == 0 { 314 v1 = 1 315 } 316 v2 := decodeInt(buf[w[0] : w[0]+w[1]]) 317 v3 := decodeInt(buf[w[0]+w[1] : w[0]+w[1]+w[2]]) 318 x := int(start) + i 319 for cap(table) <= x { 320 table = append(table[:cap(table)], xref{}) 321 } 322 if table[x].ptr != (objptr{}) { 323 continue 324 } 325 switch v1 { 326 case 0: 327 table[x] = xref{ptr: objptr{0, 65535}} 328 case 1: 329 table[x] = xref{ptr: objptr{uint32(x), uint16(v3)}, offset: int64(v2)} 330 case 2: 331 table[x] = xref{ptr: objptr{uint32(x), 0}, inStream: true, stream: objptr{uint32(v2), 0}, offset: int64(v3)} 332 default: 333 fmt.Printf("invalid xref stream type %d: %x\n", v1, buf) 334 } 335 } 336 } 337 return table, nil 338 } 339 340 func decodeInt(b []byte) int { 341 x := 0 342 for _, c := range b { 343 x = x<<8 | int(c) 344 } 345 return x 346 } 347 348 func readXrefTable(r *Reader, b *buffer) ([]xref, objptr, dict, error) { 349 var table []xref 350 351 table, err := readXrefTableData(b, table) 352 if err != nil { 353 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) 354 } 355 356 trailer, ok := b.readObject().(dict) 357 if !ok { 358 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref table not followed by trailer 
dictionary") 359 } 360 361 for prevoff := trailer["Prev"]; prevoff != nil; { 362 off, ok := prevoff.(int64) 363 if !ok { 364 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff) 365 } 366 b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off) 367 tok := b.readToken() 368 if tok != keyword("xref") { 369 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev does not point to xref") 370 } 371 table, err = readXrefTableData(b, table) 372 if err != nil { 373 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) 374 } 375 376 trailer, ok := b.readObject().(dict) 377 if !ok { 378 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev table not followed by trailer dictionary") 379 } 380 prevoff = trailer["Prev"] 381 } 382 383 size, ok := trailer[name("Size")].(int64) 384 if !ok { 385 return nil, objptr{}, nil, fmt.Errorf("malformed PDF: trailer missing /Size entry") 386 } 387 388 if size < int64(len(table)) { 389 table = table[:size] 390 } 391 392 return table, objptr{}, trailer, nil 393 } 394 395 func readXrefTableData(b *buffer, table []xref) ([]xref, error) { 396 for { 397 tok := b.readToken() 398 if tok == keyword("trailer") { 399 break 400 } 401 start, ok1 := tok.(int64) 402 n, ok2 := b.readToken().(int64) 403 if !ok1 || !ok2 { 404 return nil, fmt.Errorf("malformed xref table") 405 } 406 for i := 0; i < int(n); i++ { 407 off, ok1 := b.readToken().(int64) 408 gen, ok2 := b.readToken().(int64) 409 alloc, ok3 := b.readToken().(keyword) 410 if !ok1 || !ok2 || !ok3 || alloc != keyword("f") && alloc != keyword("n") { 411 return nil, fmt.Errorf("malformed xref table") 412 } 413 x := int(start) + i 414 for cap(table) <= x { 415 table = append(table[:cap(table)], xref{}) 416 } 417 if len(table) <= x { 418 table = table[:x+1] 419 } 420 if alloc == "n" && table[x].offset == 0 { 421 table[x] = xref{ptr: objptr{uint32(x), uint16(gen)}, offset: int64(off)} 422 } 423 } 424 } 425 return table, nil 426 } 427 
428 func findLastLine(buf []byte, s string) int { 429 bs := []byte(s) 430 max := len(buf) 431 for { 432 i := bytes.LastIndex(buf[:max], bs) 433 if i <= 0 || i+len(bs) >= len(buf) { 434 return -1 435 } 436 if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') { 437 return i 438 } 439 max = i 440 } 441 } 442 443 // A Value is a single PDF value, such as an integer, dictionary, or array. 444 // The zero Value is a PDF null (Kind() == Null, IsNull() = true). 445 type Value struct { 446 r *Reader 447 ptr objptr 448 data interface{} 449 } 450 451 // IsNull reports whether the value is a null. It is equivalent to Kind() == Null. 452 func (v Value) IsNull() bool { 453 return v.data == nil 454 } 455 456 // A ValueKind specifies the kind of data underlying a Value. 457 type ValueKind int 458 459 // The PDF value kinds. 460 const ( 461 Null ValueKind = iota 462 Bool 463 Integer 464 Real 465 String 466 Name 467 Dict 468 Array 469 Stream 470 ) 471 472 // Kind reports the kind of value underlying v. 473 func (v Value) Kind() ValueKind { 474 switch v.data.(type) { 475 default: 476 return Null 477 case bool: 478 return Bool 479 case int64: 480 return Integer 481 case float64: 482 return Real 483 case string: 484 return String 485 case name: 486 return Name 487 case dict: 488 return Dict 489 case array: 490 return Array 491 case stream: 492 return Stream 493 } 494 } 495 496 // String returns a textual representation of the value v. 497 // Note that String is not the accessor for values with Kind() == String. 498 // To access such values, see RawString, Text, and TextFromUTF16. 
499 func (v Value) String() string { 500 return objfmt(v.data) 501 } 502 503 func objfmt(x interface{}) string { 504 switch x := x.(type) { 505 default: 506 return fmt.Sprint(x) 507 case string: 508 if isPDFDocEncoded(x) { 509 return strconv.Quote(pdfDocDecode(x)) 510 } 511 if isUTF16(x) { 512 return strconv.Quote(utf16Decode(x[2:])) 513 } 514 return strconv.Quote(x) 515 case name: 516 return "/" + string(x) 517 case dict: 518 var keys []string 519 for k := range x { 520 keys = append(keys, string(k)) 521 } 522 sort.Strings(keys) 523 var buf bytes.Buffer 524 buf.WriteString("<<") 525 for i, k := range keys { 526 elem := x[name(k)] 527 if i > 0 { 528 buf.WriteString(" ") 529 } 530 buf.WriteString("/") 531 buf.WriteString(k) 532 buf.WriteString(" ") 533 buf.WriteString(objfmt(elem)) 534 } 535 buf.WriteString(">>") 536 return buf.String() 537 538 case array: 539 var buf bytes.Buffer 540 buf.WriteString("[") 541 for i, elem := range x { 542 if i > 0 { 543 buf.WriteString(" ") 544 } 545 buf.WriteString(objfmt(elem)) 546 } 547 buf.WriteString("]") 548 return buf.String() 549 550 case stream: 551 return fmt.Sprintf("%v@%d", objfmt(x.hdr), x.offset) 552 553 case objptr: 554 return fmt.Sprintf("%d %d R", x.id, x.gen) 555 556 case objdef: 557 return fmt.Sprintf("{%d %d obj}%v", x.ptr.id, x.ptr.gen, objfmt(x.obj)) 558 } 559 } 560 561 // Bool returns v's boolean value. 562 // If v.Kind() != Bool, Bool returns false. 563 func (v Value) Bool() bool { 564 x, ok := v.data.(bool) 565 if !ok { 566 return false 567 } 568 return x 569 } 570 571 // Int64 returns v's int64 value. 572 // If v.Kind() != Int64, Int64 returns 0. 573 func (v Value) Int64() int64 { 574 x, ok := v.data.(int64) 575 if !ok { 576 return 0 577 } 578 return x 579 } 580 581 // Int returns v's int value, converting from int64. 582 // If v.Kind() != Int64, Int returns 0. 
583 func (v Value) Int() int { 584 x, ok := v.data.(int64) 585 if !ok { 586 return 0 587 } 588 return int(x) 589 } 590 591 // Float64 returns v's float64 value, converting from integer if necessary. 592 // If v.Kind() != Float64 and v.Kind() != Int64, Float64 returns 0. 593 func (v Value) Float64() float64 { 594 x, ok := v.data.(float64) 595 if !ok { 596 x, ok := v.data.(int64) 597 if ok { 598 return float64(x) 599 } 600 return 0 601 } 602 return x 603 } 604 605 // Float32 returns v's float32 value, converting from integer or float64. 606 // If v.Kind() != Float64 and v.Kind() != Int64, Float32 returns 0. 607 func (v Value) Float32() float32 { 608 x, ok := v.data.(float64) 609 if !ok { 610 x, ok := v.data.(int64) 611 if ok { 612 return float32(x) 613 } 614 return 0 615 } 616 return float32(x) 617 } 618 619 // RawString returns v's string value. 620 // If v.Kind() != String, RawString returns the empty string. 621 func (v Value) RawString() string { 622 x, ok := v.data.(string) 623 if !ok { 624 return "" 625 } 626 return x 627 } 628 629 // Text returns v's string value interpreted as a ``text string'' (defined in the PDF spec) 630 // and converted to UTF-8. 631 // If v.Kind() != String, Text returns the empty string. 632 func (v Value) Text() string { 633 x, ok := v.data.(string) 634 if !ok { 635 return "" 636 } 637 if isPDFDocEncoded(x) { 638 return pdfDocDecode(x) 639 } 640 if isUTF16(x) { 641 return utf16Decode(x[2:]) 642 } 643 return x 644 } 645 646 // TextFromUTF16 returns v's string value interpreted as big-endian UTF-16 647 // and then converted to UTF-8. 648 // If v.Kind() != String or if the data is not valid UTF-16, TextFromUTF16 returns 649 // the empty string. 650 func (v Value) TextFromUTF16() string { 651 x, ok := v.data.(string) 652 if !ok { 653 return "" 654 } 655 if len(x)%2 == 1 { 656 return "" 657 } 658 if x == "" { 659 return "" 660 } 661 return utf16Decode(x) 662 } 663 664 // Name returns v's name value. 
665 // If v.Kind() != Name, Name returns the empty string. 666 // The returned name does not include the leading slash: 667 // if v corresponds to the name written using the syntax /Helvetica, 668 // Name() == "Helvetica". 669 func (v Value) Name() string { 670 x, ok := v.data.(name) 671 if !ok { 672 return "" 673 } 674 return string(x) 675 } 676 677 // Key returns the value associated with the given name key in the dictionary v. 678 // Like the result of the Name method, the key should not include a leading slash. 679 // If v is a stream, Key applies to the stream's header dictionary. 680 // If v.Kind() != Dict and v.Kind() != Stream, Key returns a null Value. 681 func (v Value) Key(key string) Value { 682 x, ok := v.data.(dict) 683 if !ok { 684 strm, ok := v.data.(stream) 685 if !ok { 686 return Value{} 687 } 688 x = strm.hdr 689 } 690 return v.r.resolve(v.ptr, x[name(key)]) 691 } 692 693 // Keys returns a sorted list of the keys in the dictionary v. 694 // If v is a stream, Keys applies to the stream's header dictionary. 695 // If v.Kind() != Dict and v.Kind() != Stream, Keys returns nil. 696 func (v Value) Keys() []string { 697 x, ok := v.data.(dict) 698 if !ok { 699 strm, ok := v.data.(stream) 700 if !ok { 701 return nil 702 } 703 x = strm.hdr 704 } 705 keys := []string{} // not nil 706 for k := range x { 707 keys = append(keys, string(k)) 708 } 709 sort.Strings(keys) 710 return keys 711 } 712 713 // Index returns the i'th element in the array v. 714 // If v.Kind() != Array or if i is outside the array bounds, 715 // Index returns a null Value. 716 func (v Value) Index(i int) Value { 717 x, ok := v.data.(array) 718 if !ok || i < 0 || i >= len(x) { 719 return Value{} 720 } 721 return v.r.resolve(v.ptr, x[i]) 722 } 723 724 // Len returns the length of the array v. 725 // If v.Kind() != Array, Len returns 0. 
726 func (v Value) Len() int { 727 x, ok := v.data.(array) 728 if !ok { 729 return 0 730 } 731 return len(x) 732 } 733 734 func (r *Reader) resolve(parent objptr, x interface{}) Value { 735 if ptr, ok := x.(objptr); ok { 736 if ptr.id >= uint32(len(r.xref)) { 737 return Value{} 738 } 739 xref := r.xref[ptr.id] 740 if xref.ptr != ptr || !xref.inStream && xref.offset == 0 { 741 return Value{} 742 } 743 var obj object 744 if xref.inStream { 745 strm := r.resolve(parent, xref.stream) 746 Search: 747 for { 748 if strm.Kind() != Stream { 749 panic("not a stream") 750 } 751 if strm.Key("Type").Name() != "ObjStm" { 752 panic("not an object stream") 753 } 754 n := int(strm.Key("N").Int64()) 755 first := strm.Key("First").Int64() 756 if first == 0 { 757 panic("missing First") 758 } 759 b := newBuffer(strm.Reader(), 0) 760 b.allowEOF = true 761 for i := 0; i < n; i++ { 762 id, _ := b.readToken().(int64) 763 off, _ := b.readToken().(int64) 764 if uint32(id) == ptr.id { 765 b.seekForward(first + off) 766 x = b.readObject() 767 break Search 768 } 769 } 770 ext := strm.Key("Extends") 771 if ext.Kind() != Stream { 772 panic("cannot find object in stream") 773 } 774 strm = ext 775 } 776 } else { 777 b := newBuffer(io.NewSectionReader(r.f, xref.offset, r.end-xref.offset), xref.offset) 778 b.key = r.key 779 b.useAES = r.useAES 780 obj = b.readObject() 781 def, ok := obj.(objdef) 782 if !ok { 783 panic(fmt.Errorf("loading %v: found %T instead of objdef", ptr, obj)) 784 //return Value{} 785 } 786 if def.ptr != ptr { 787 panic(fmt.Errorf("loading %v: found %v", ptr, def.ptr)) 788 } 789 x = def.obj 790 } 791 parent = ptr 792 } 793 794 switch x := x.(type) { 795 case nil, bool, int64, float64, name, dict, array, stream: 796 return Value{r, parent, x} 797 case string: 798 return Value{r, parent, x} 799 default: 800 panic(fmt.Errorf("unexpected value type %T in resolve", x)) 801 } 802 } 803 804 type errorReadCloser struct { 805 err error 806 } 807 808 func (e *errorReadCloser) Read([]byte) 
(int, error) { 809 return 0, e.err 810 } 811 812 func (e *errorReadCloser) Close() error { 813 return e.err 814 } 815 816 // Reader returns the data contained in the stream v. 817 // If v.Kind() != Stream, Reader returns a ReadCloser that 818 // responds to all reads with a ``stream not present'' error. 819 func (v Value) Reader() io.ReadCloser { 820 x, ok := v.data.(stream) 821 if !ok { 822 return &errorReadCloser{fmt.Errorf("stream not present")} 823 } 824 var rd io.Reader 825 rd = io.NewSectionReader(v.r.f, x.offset, v.Key("Length").Int64()) 826 if v.r.key != nil { 827 rd = decryptStream(v.r.key, v.r.useAES, x.ptr, rd) 828 } 829 filter := v.Key("Filter") 830 param := v.Key("DecodeParms") 831 switch filter.Kind() { 832 default: 833 panic(fmt.Errorf("unsupported filter %v", filter)) 834 case Null: 835 // ok 836 case Name: 837 rd = applyFilter(rd, filter.Name(), param) 838 case Array: 839 for i := 0; i < filter.Len(); i++ { 840 rd = applyFilter(rd, filter.Index(i).Name(), param.Index(i)) 841 } 842 } 843 844 return ioutil.NopCloser(rd) 845 } 846 847 func applyFilter(rd io.Reader, name string, param Value) io.Reader { 848 switch name { 849 default: 850 panic("unknown filter " + name) 851 case "FlateDecode": 852 zr, err := zlib.NewReader(rd) 853 if err != nil { 854 panic(err) 855 } 856 pred := param.Key("Predictor") 857 if pred.Kind() == Null { 858 return zr 859 } 860 columns := param.Key("Columns").Int64() 861 switch pred.Int64() { 862 default: 863 fmt.Println("unknown predictor", pred) 864 panic("pred") 865 case 12: 866 return &pngUpReader{r: zr, hist: make([]byte, 1+columns), tmp: make([]byte, 1+columns)} 867 } 868 869 case "CCITTFaxDecode": 870 sf := ccitt.Group3 871 if param.Key("K").Int() < 0 { 872 sf = ccitt.Group4 873 } 874 width := 1728 875 if cols := param.Key("Columns"); !cols.IsNull() { 876 width = cols.Int() 877 } 878 height := ccitt.AutoDetectHeight 879 if rows := param.Key("Rows").Int(); rows != 0 { 880 height = rows 881 } 882 invert := 
param.Key("BlackIs1").Bool() 883 return ccitt.NewReader(rd, ccitt.MSB, sf, width, height, &ccitt.Options{Invert: invert}) 884 } 885 } 886 887 type pngUpReader struct { 888 r io.Reader 889 hist []byte 890 tmp []byte 891 pend []byte 892 } 893 894 func (r *pngUpReader) Read(b []byte) (int, error) { 895 n := 0 896 for len(b) > 0 { 897 if len(r.pend) > 0 { 898 m := copy(b, r.pend) 899 n += m 900 b = b[m:] 901 r.pend = r.pend[m:] 902 continue 903 } 904 _, err := io.ReadFull(r.r, r.tmp) 905 if err != nil { 906 return n, err 907 } 908 if r.tmp[0] != 2 { 909 return n, fmt.Errorf("malformed PNG-Up encoding") 910 } 911 for i, b := range r.tmp { 912 r.hist[i] += b 913 } 914 r.pend = r.hist[1:] 915 } 916 return n, nil 917 } 918 919 // HasFilter returns whether v is a stream encoded with the specified filter. 920 // (There may be other filters as well.) 921 func (v Value) HasFilter(filterName string) bool { 922 if _, ok := v.data.(stream); !ok { 923 return false 924 } 925 filter := v.Key("Filter") 926 switch filter.Kind() { 927 case Name: 928 return filter.Name() == filterName 929 case Array: 930 for i := 0; i < filter.Len(); i++ { 931 if filter.Index(i).Name() == filterName { 932 return true 933 } 934 } 935 } 936 937 return false 938 } 939 940 // EncodedReader returns the data contained in the stream v. 941 // It does not apply the specified filter, so the returned data will be in the 942 // format that filter expects as input (assuming the stream actually has that 943 // filter). 944 // Before calling EncodedReader, you should check whether the stream has the 945 // filter you are interested in, by calling HasFilter. 
946 func (v Value) EncodedReader(filterName string) io.Reader { 947 x, ok := v.data.(stream) 948 if !ok { 949 return &errorReadCloser{fmt.Errorf("stream not present")} 950 } 951 var rd io.Reader 952 rd = io.NewSectionReader(v.r.f, x.offset, v.Key("Length").Int64()) 953 if v.r.key != nil { 954 rd = decryptStream(v.r.key, v.r.useAES, x.ptr, rd) 955 } 956 filter := v.Key("Filter") 957 param := v.Key("DecodeParms") 958 switch filter.Kind() { 959 default: 960 panic(fmt.Errorf("unsupported filter %v", filter)) 961 case Null: 962 // ok 963 case Name: 964 if filter.Name() == filterName { 965 return rd 966 } 967 rd = applyFilter(rd, filter.Name(), param) 968 case Array: 969 for i := 0; i < filter.Len(); i++ { 970 if filter.Index(i).Name() == filterName { 971 return rd 972 } 973 rd = applyFilter(rd, filter.Index(i).Name(), param.Index(i)) 974 } 975 } 976 977 return rd 978 } 979 980 var passwordPad = []byte{ 981 0x28, 0xBF, 0x4E, 0x5E, 0x4E, 0x75, 0x8A, 0x41, 0x64, 0x00, 0x4E, 0x56, 0xFF, 0xFA, 0x01, 0x08, 982 0x2E, 0x2E, 0x00, 0xB6, 0xD0, 0x68, 0x3E, 0x80, 0x2F, 0x0C, 0xA9, 0xFE, 0x64, 0x53, 0x69, 0x7A, 983 } 984 985 func (r *Reader) initEncrypt(password string) error { 986 // See PDF 32000-1:2008, §7.6. 
987 encrypt, _ := r.resolve(objptr{}, r.trailer["Encrypt"]).data.(dict) 988 if encrypt["Filter"] != name("Standard") { 989 return fmt.Errorf("unsupported PDF: encryption filter %v", objfmt(encrypt["Filter"])) 990 } 991 n, _ := encrypt["Length"].(int64) 992 if n == 0 { 993 n = 40 994 } 995 if n%8 != 0 || n > 128 || n < 40 { 996 return fmt.Errorf("malformed PDF: %d-bit encryption key", n) 997 } 998 V, _ := encrypt["V"].(int64) 999 if V != 1 && V != 2 && (V != 4 || !okayV4(encrypt)) { 1000 return fmt.Errorf("unsupported PDF: encryption version V=%d; %v", V, objfmt(encrypt)) 1001 } 1002 1003 ids, ok := r.trailer["ID"].(array) 1004 if !ok || len(ids) < 1 { 1005 return fmt.Errorf("malformed PDF: missing ID in trailer") 1006 } 1007 idstr, ok := ids[0].(string) 1008 if !ok { 1009 return fmt.Errorf("malformed PDF: missing ID in trailer") 1010 } 1011 ID := []byte(idstr) 1012 1013 R, _ := encrypt["R"].(int64) 1014 if R < 2 { 1015 return fmt.Errorf("malformed PDF: encryption revision R=%d", R) 1016 } 1017 if R > 4 { 1018 return fmt.Errorf("unsupported PDF: encryption revision R=%d", R) 1019 } 1020 O, _ := encrypt["O"].(string) 1021 U, _ := encrypt["U"].(string) 1022 if len(O) != 32 || len(U) != 32 { 1023 return fmt.Errorf("malformed PDF: missing O= or U= encryption parameters") 1024 } 1025 p, _ := encrypt["P"].(int64) 1026 P := uint32(p) 1027 1028 // TODO: Password should be converted to Latin-1. 
1029 pw := []byte(password) 1030 h := md5.New() 1031 if len(pw) >= 32 { 1032 h.Write(pw[:32]) 1033 } else { 1034 h.Write(pw) 1035 h.Write(passwordPad[:32-len(pw)]) 1036 } 1037 h.Write([]byte(O)) 1038 h.Write([]byte{byte(P), byte(P >> 8), byte(P >> 16), byte(P >> 24)}) 1039 h.Write([]byte(ID)) 1040 key := h.Sum(nil) 1041 1042 if R >= 3 { 1043 for i := 0; i < 50; i++ { 1044 h.Reset() 1045 h.Write(key[:n/8]) 1046 key = h.Sum(key[:0]) 1047 } 1048 key = key[:n/8] 1049 } else { 1050 key = key[:40/8] 1051 } 1052 1053 c, err := rc4.NewCipher(key) 1054 if err != nil { 1055 return fmt.Errorf("malformed PDF: invalid RC4 key: %v", err) 1056 } 1057 1058 var u []byte 1059 if R == 2 { 1060 u = make([]byte, 32) 1061 copy(u, passwordPad) 1062 c.XORKeyStream(u, u) 1063 } else { 1064 h.Reset() 1065 h.Write(passwordPad) 1066 h.Write([]byte(ID)) 1067 u = h.Sum(nil) 1068 c.XORKeyStream(u, u) 1069 1070 for i := 1; i <= 19; i++ { 1071 key1 := make([]byte, len(key)) 1072 copy(key1, key) 1073 for j := range key1 { 1074 key1[j] ^= byte(i) 1075 } 1076 c, _ = rc4.NewCipher(key1) 1077 c.XORKeyStream(u, u) 1078 } 1079 } 1080 1081 if !bytes.HasPrefix([]byte(U), u) { 1082 return ErrInvalidPassword 1083 } 1084 1085 r.key = key 1086 r.useAES = V == 4 1087 1088 return nil 1089 } 1090 1091 var ErrInvalidPassword = fmt.Errorf("encrypted PDF: invalid password") 1092 1093 func okayV4(encrypt dict) bool { 1094 cf, ok := encrypt["CF"].(dict) 1095 if !ok { 1096 return false 1097 } 1098 stmf, ok := encrypt["StmF"].(name) 1099 if !ok { 1100 return false 1101 } 1102 strf, ok := encrypt["StrF"].(name) 1103 if !ok { 1104 return false 1105 } 1106 if stmf != strf { 1107 return false 1108 } 1109 cfparam, ok := cf[stmf].(dict) 1110 if cfparam["AuthEvent"] != nil && cfparam["AuthEvent"] != name("DocOpen") { 1111 return false 1112 } 1113 if cfparam["Length"] != nil && cfparam["Length"] != int64(16) { 1114 return false 1115 } 1116 if cfparam["CFM"] != name("AESV2") { 1117 return false 1118 } 1119 return true 1120 } 
1121 1122 func cryptKey(key []byte, useAES bool, ptr objptr) []byte { 1123 h := md5.New() 1124 h.Write(key) 1125 h.Write([]byte{byte(ptr.id), byte(ptr.id >> 8), byte(ptr.id >> 16), byte(ptr.gen), byte(ptr.gen >> 8)}) 1126 if useAES { 1127 h.Write([]byte("sAlT")) 1128 } 1129 return h.Sum(nil) 1130 } 1131 1132 func decryptString(key []byte, useAES bool, ptr objptr, x string) string { 1133 key = cryptKey(key, useAES, ptr) 1134 if useAES { 1135 panic("AES not implemented") 1136 } else { 1137 c, _ := rc4.NewCipher(key) 1138 data := []byte(x) 1139 c.XORKeyStream(data, data) 1140 x = string(data) 1141 } 1142 return x 1143 } 1144 1145 func decryptStream(key []byte, useAES bool, ptr objptr, rd io.Reader) io.Reader { 1146 key = cryptKey(key, useAES, ptr) 1147 if useAES { 1148 cb, err := aes.NewCipher(key) 1149 if err != nil { 1150 panic("AES: " + err.Error()) 1151 } 1152 iv := make([]byte, 16) 1153 io.ReadFull(rd, iv) 1154 cbc := cipher.NewCBCDecrypter(cb, iv) 1155 rd = &cbcReader{cbc: cbc, rd: rd, buf: make([]byte, 16)} 1156 } else { 1157 c, _ := rc4.NewCipher(key) 1158 rd = &cipher.StreamReader{S: c, R: rd} 1159 } 1160 return rd 1161 } 1162 1163 type cbcReader struct { 1164 cbc cipher.BlockMode 1165 rd io.Reader 1166 buf []byte 1167 pend []byte 1168 } 1169 1170 func (r *cbcReader) Read(b []byte) (n int, err error) { 1171 if len(r.pend) == 0 { 1172 _, err = io.ReadFull(r.rd, r.buf) 1173 if err != nil { 1174 return 0, err 1175 } 1176 r.cbc.CryptBlocks(r.buf, r.buf) 1177 r.pend = r.buf 1178 } 1179 n = copy(b, r.pend) 1180 r.pend = r.pend[n:] 1181 return n, nil 1182 }