github.com/pdfcpu/pdfcpu@v0.11.1/pkg/pdfcpu/extract.go (about) 1 /* 2 Copyright 2018 The pdfcpu Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package pdfcpu 18 19 import ( 20 "bytes" 21 "fmt" 22 "io" 23 "strings" 24 25 "github.com/pdfcpu/pdfcpu/pkg/filter" 26 "github.com/pdfcpu/pdfcpu/pkg/log" 27 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/font" 28 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model" 29 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types" 30 "github.com/pkg/errors" 31 ) 32 33 // ImageObjNrs returns all image dict objNrs for pageNr. 34 // Requires an optimized context. 35 func ImageObjNrs(ctx *model.Context, pageNr int) []int { 36 // TODO Exclude SMask image objects. 37 objNrs := []int{} 38 39 if pageNr < 1 { 40 return objNrs 41 } 42 43 imgObjNrs := ctx.Optimize.PageImages 44 if len(imgObjNrs) == 0 { 45 return objNrs 46 } 47 48 pageImgObjNrs := imgObjNrs[pageNr-1] 49 if pageImgObjNrs == nil { 50 return objNrs 51 } 52 53 for k, v := range pageImgObjNrs { 54 if v { 55 objNrs = append(objNrs, k) 56 } 57 } 58 return objNrs 59 } 60 61 // StreamLength returns sd's stream length. 62 func StreamLength(ctx *model.Context, sd *types.StreamDict) (int64, error) { 63 if val := sd.Int64Entry("Length"); val != nil { 64 return *val, nil 65 } 66 67 indRef := sd.IndirectRefEntry("Length") 68 if indRef == nil { 69 return 0, nil 70 } 71 72 i, err := ctx.DereferenceInteger(*indRef) 73 if err != nil || i == nil { 74 return 0, err 75 } 76 77 return int64(*i), nil 78 } 79 80 // ColorSpaceString returns a string representation for sd's colorspace. 81 func ColorSpaceString(ctx *model.Context, sd *types.StreamDict) (string, error) { 82 o, found := sd.Find("ColorSpace") 83 if !found { 84 return "", nil 85 } 86 87 o, err := ctx.Dereference(o) 88 if err != nil { 89 return "", err 90 } 91 92 switch cs := o.(type) { 93 94 case types.Name: 95 return string(cs), nil 96 97 case types.Array: 98 return string(cs[0].(types.Name)), nil 99 } 100 101 return "", nil 102 } 103 104 func colorSpaceNameComponents(cs types.Name) int { 105 switch cs { 106 107 case model.DeviceGrayCS: 108 return 1 109 110 case model.DeviceRGBCS: 111 return 3 112 113 case model.DeviceCMYKCS: 114 return 4 115 } 116 117 return 0 118 } 119 120 func indexedColorSpaceComponents(xRefTable *model.XRefTable, cs types.Array) (int, error) { 121 baseCS, err := xRefTable.Dereference(cs[1]) 122 if err != nil { 123 return 0, err 124 } 125 126 switch cs := baseCS.(type) { 127 case types.Name: 128 return colorSpaceNameComponents(cs), nil 129 130 case types.Array: 131 switch cs[0].(types.Name) { 132 133 case model.CalGrayCS: 134 return 1, nil 135 136 case model.CalRGBCS: 137 return 3, nil 138 139 case model.LabCS: 140 return 3, nil 141 142 case model.ICCBasedCS: 143 iccProfileStream, _, err := xRefTable.DereferenceStreamDict(cs[1]) 144 if err != nil { 145 return 0, err 146 } 147 n := iccProfileStream.IntEntry("N") 148 i := 0 149 if n != nil { 150 i = *n 151 } 152 return i, nil 153 154 case model.SeparationCS: 155 return 1, nil 156 157 case model.DeviceNCS: 158 return len(cs[1].(types.Array)), nil 159 } 160 } 161 162 return 0, nil 163 } 164 165 // ColorSpaceComponents returns the corresponding number of used color components for sd's colorspace. 166 func ColorSpaceComponents(xRefTable *model.XRefTable, sd *types.StreamDict) (int, error) { 167 o, found := sd.Find("ColorSpace") 168 if !found { 169 return 0, nil 170 } 171 172 o, err := xRefTable.Dereference(o) 173 if err != nil { 174 return 0, err 175 } 176 177 switch cs := o.(type) { 178 case types.Name: 179 return colorSpaceNameComponents(cs), nil 180 181 case types.Array: 182 switch cs[0].(types.Name) { 183 184 case model.CalGrayCS: 185 return 1, nil 186 187 case model.CalRGBCS: 188 return 3, nil 189 190 case model.LabCS: 191 return 3, nil 192 193 case model.ICCBasedCS: 194 iccProfileStream, _, err := xRefTable.DereferenceStreamDict(cs[1]) 195 if err != nil { 196 return 0, err 197 } 198 n := iccProfileStream.IntEntry("N") 199 i := 0 200 if n != nil { 201 i = *n 202 } 203 return i, nil 204 205 case model.SeparationCS: 206 return 1, nil 207 208 case model.DeviceNCS: 209 return len(cs[1].(types.Array)), nil 210 211 case model.IndexedCS: 212 return indexedColorSpaceComponents(xRefTable, cs) 213 214 } 215 } 216 217 return 0, nil 218 } 219 220 func imageWidth(ctx *model.Context, sd *types.StreamDict, objNr int) (int, error) { 221 obj, ok := sd.Find("Width") 222 if !ok { 223 return 0, errors.Errorf("pdfcpu: missing image width obj#%d", objNr) 224 } 225 i, err := ctx.DereferenceInteger(obj) 226 if err != nil { 227 return 0, err 228 } 229 return i.Value(), nil 230 } 231 232 func imageHeight(ctx *model.Context, sd *types.StreamDict, objNr int) (int, error) { 233 obj, ok := sd.Find("Height") 234 if !ok { 235 return 0, errors.Errorf("pdfcpu: missing image height obj#%d", objNr) 236 } 237 i, err := ctx.DereferenceInteger(obj) 238 if err != nil { 239 return 0, err 240 } 241 return i.Value(), nil 242 } 243 244 func imageStub( 245 ctx *model.Context, 246 sd *types.StreamDict, 247 resourceId, filters, lastFilter string, 248 decodeParms types.Dict, 249 thumb, imgMask bool, 250 objNr int) (*model.Image, error) { 251 252 w, err := imageWidth(ctx, sd, objNr) 253 if err != nil { 254 return nil, err 255 } 256 257 h, err := imageHeight(ctx, sd, objNr) 258 if err != nil { 259 return nil, err 260 } 261 262 cs, err := ColorSpaceString(ctx, sd) 263 if err != nil { 264 return nil, err 265 } 266 267 comp, err := ColorSpaceComponents(ctx.XRefTable, sd) 268 if err != nil { 269 return nil, err 270 } 271 if lastFilter == filter.CCITTFax { 272 comp = 1 273 } 274 275 bpc := 0 276 if i := sd.IntEntry("BitsPerComponent"); i != nil { 277 bpc = *i 278 } 279 // if jpx, bpc is undefined 280 if imgMask { 281 bpc = 1 282 } 283 284 var sMask bool 285 if sm, _ := sd.Find("SMask"); sm != nil { 286 sMask = true 287 } 288 289 var mask bool 290 if sm, _ := sd.Find("Mask"); sm != nil { 291 mask = true 292 } 293 294 var interpol bool 295 if b := sd.BooleanEntry("Interpolate"); b != nil && *b { 296 interpol = true 297 } 298 299 size, err := StreamLength(ctx, sd) 300 if err != nil { 301 return nil, err 302 } 303 304 var s string 305 if decodeParms != nil { 306 s = decodeParms.String() 307 } 308 309 img := &model.Image{ 310 ObjNr: objNr, 311 Name: resourceId, 312 Thumb: thumb, 313 IsImgMask: imgMask, 314 HasImgMask: mask, 315 HasSMask: sMask, 316 Width: w, 317 Height: h, 318 Cs: cs, 319 Comp: comp, 320 Bpc: bpc, 321 Interpol: interpol, 322 Size: size, 323 Filter: filters, 324 DecodeParms: s, 325 } 326 327 return img, nil 328 } 329 330 func prepareExtractImage(sd *types.StreamDict) (string, string, types.Dict, bool) { 331 var imgMask bool 332 if im := sd.BooleanEntry("ImageMask"); im != nil && *im { 333 imgMask = true 334 } 335 336 var ( 337 filters string 338 lastFilter string 339 d types.Dict 340 ) 341 342 fpl := sd.FilterPipeline 343 if fpl != nil { 344 var s []string 345 for _, filter := range fpl { 346 s = append(s, filter.Name) 347 lastFilter = filter.Name 348 if filter.DecodeParms != nil { 349 d = filter.DecodeParms 350 } 351 } 352 filters = strings.Join(s, ",") 353 } 354 355 return filters, lastFilter, d, imgMask 356 } 357 func decodeImage(ctx *model.Context, sd *types.StreamDict, filters, lastFilter string, objNr int) error { 358 // CCITTDecoded images / (bit) masks don't have a ColorSpace attribute, but we render image files. 359 if lastFilter == filter.CCITTFax { 360 if _, err := ctx.DereferenceDictEntry(sd.Dict, "ColorSpace"); err != nil { 361 sd.InsertName("ColorSpace", model.DeviceGrayCS) 362 } 363 } 364 365 if lastFilter == filter.DCT { 366 comp, err := ColorSpaceComponents(ctx.XRefTable, sd) 367 if err != nil { 368 return err 369 } 370 sd.CSComponents = comp 371 } 372 373 switch lastFilter { 374 375 case filter.DCT, filter.JPX, filter.Flate, filter.LZW, filter.CCITTFax, filter.RunLength: 376 if err := sd.Decode(); err != nil { 377 return err 378 } 379 380 default: 381 msg := fmt.Sprintf("pdfcpu: ExtractImage(obj#%d): skipping img, filter %s unsupported", objNr, filters) 382 if log.DebugEnabled() { 383 log.Debug.Println(msg) 384 } 385 if log.CLIEnabled() { 386 log.CLI.Println(msg) 387 } 388 return nil 389 } 390 391 return nil 392 } 393 394 func img( 395 ctx *model.Context, 396 sd *types.StreamDict, 397 thumb bool, 398 resourceID, filters, lastFilter string, 399 objNr int) (*model.Image, error) { 400 401 if sd.FilterPipeline == nil { 402 sd.Content = sd.Raw 403 } else { 404 if err := decodeImage(ctx, sd, filters, lastFilter, objNr); err != nil { 405 return nil, err 406 } 407 } 408 409 r, t, err := RenderImage(ctx.XRefTable, sd, thumb, resourceID, objNr) 410 if err != nil { 411 return nil, err 412 } 413 414 img := &model.Image{ 415 Reader: r, 416 Name: resourceID, 417 ObjNr: objNr, 418 Thumb: thumb, 419 FileType: t, 420 } 421 422 return img, nil 423 } 424 425 // ExtractImage extracts an image from sd. 426 func ExtractImage(ctx *model.Context, sd *types.StreamDict, thumb bool, resourceID string, objNr int, stub bool) (*model.Image, error) { 427 if sd == nil { 428 return nil, nil 429 } 430 431 filters, lastFilter, decodeParms, imgMask := prepareExtractImage(sd) 432 433 if stub { 434 return imageStub(ctx, sd, resourceID, filters, lastFilter, decodeParms, thumb, imgMask, objNr) 435 } 436 437 return img(ctx, sd, thumb, resourceID, filters, lastFilter, objNr) 438 } 439 440 // ExtractPageImages extracts all images used by pageNr. 441 // Optionally return stubs only. 442 func ExtractPageImages(ctx *model.Context, pageNr int, stub bool) (map[int]model.Image, error) { 443 m := map[int]model.Image{} 444 for _, objNr := range ImageObjNrs(ctx, pageNr) { 445 imageObj := ctx.Optimize.ImageObjects[objNr] 446 img, err := ExtractImage(ctx, imageObj.ImageDict, false, imageObj.ResourceNames[pageNr-1], objNr, stub) 447 if err != nil { 448 return nil, err 449 } 450 if img != nil { 451 img.PageNr = pageNr 452 m[objNr] = *img 453 } 454 } 455 // Extract thumbnail for pageNr 456 if indRef, ok := ctx.PageThumbs[pageNr]; ok { 457 objNr := indRef.ObjectNumber.Value() 458 sd, _, err := ctx.DereferenceStreamDict(indRef) 459 if err != nil { 460 return nil, err 461 } 462 img, err := ExtractImage(ctx, sd, true, "", objNr, stub) 463 if err != nil { 464 return nil, err 465 } 466 if img != nil { 467 img.PageNr = pageNr 468 m[objNr] = *img 469 } 470 } 471 return m, nil 472 } 473 474 // Font is a Reader representing an embedded font. 475 type Font struct { 476 io.Reader 477 Name string 478 Type string 479 } 480 481 // FontObjNrs returns all font dict objNrs for pageNr. 482 // Requires an optimized context. 483 func FontObjNrs(ctx *model.Context, pageNr int) []int { 484 objNrs := []int{} 485 486 if pageNr < 1 { 487 return objNrs 488 } 489 490 fontObjNrs := ctx.Optimize.PageFonts 491 if len(fontObjNrs) == 0 { 492 return objNrs 493 } 494 495 pageFontObjNrs := fontObjNrs[pageNr-1] 496 if pageFontObjNrs == nil { 497 return objNrs 498 } 499 500 for k, v := range pageFontObjNrs { 501 if v { 502 objNrs = append(objNrs, k) 503 } 504 } 505 return objNrs 506 } 507 508 // ExtractFont extracts a font from fontObject. 509 func ExtractFont(ctx *model.Context, fontObject model.FontObject, objNr int) (*Font, error) { 510 d, err := font.FontDescriptor(ctx.XRefTable, fontObject.FontDict, objNr) 511 if err != nil { 512 return nil, err 513 } 514 515 if d == nil { 516 if log.DebugEnabled() { 517 log.Debug.Printf("ExtractFont: ignoring obj#%d - no fontDescriptor available for font: %s\n", objNr, fontObject.FontName) 518 } 519 return nil, nil 520 } 521 522 ir := fontDescriptorFontFileIndirectObjectRef(d) 523 if ir == nil { 524 if log.DebugEnabled() { 525 log.Debug.Printf("ExtractFont: ignoring obj#%d - no font file available for font: %s\n", objNr, fontObject.FontName) 526 } 527 return nil, nil 528 } 529 530 var f *Font 531 532 fontType := fontObject.SubType() 533 534 switch fontType { 535 536 case "TrueType": 537 // ttf ... true type file 538 // ttc ... true type collection 539 sd, _, err := ctx.DereferenceStreamDict(*ir) 540 if err != nil { 541 return nil, err 542 } 543 if sd == nil { 544 return nil, errors.Errorf("extractFontData: corrupt font obj#%d for font: %s\n", objNr, fontObject.FontName) 545 } 546 547 // Decode streamDict if used filter is supported only. 548 err = sd.Decode() 549 if err == filter.ErrUnsupportedFilter { 550 return nil, nil 551 } 552 if err != nil { 553 return nil, err 554 } 555 556 f = &Font{bytes.NewReader(sd.Content), fontObject.FontName, "ttf"} 557 558 default: 559 s := fmt.Sprintf("extractFontData: obj#%d - unsupported fonttype %s - font: %s\n", objNr, fontType, fontObject.FontName) 560 if log.InfoEnabled() { 561 log.Info.Println(s) 562 } 563 if log.CLIEnabled() { 564 log.CLI.Printf(s) 565 } 566 return nil, nil 567 } 568 569 return f, nil 570 } 571 572 // ExtractPageFonts extracts all fonts used by pageNr. 573 func ExtractPageFonts(ctx *model.Context, pageNr int, objNrs, skipped types.IntSet) ([]Font, error) { 574 ff := []Font{} 575 for _, i := range FontObjNrs(ctx, pageNr) { 576 if objNrs[i] || skipped[i] { 577 continue 578 } 579 fontObject := ctx.Optimize.FontObjects[i] 580 f, err := ExtractFont(ctx, *fontObject, i) 581 if err != nil { 582 return nil, err 583 } 584 if f != nil { 585 ff = append(ff, *f) 586 objNrs[i] = true 587 } else { 588 skipped[i] = true 589 } 590 } 591 return ff, nil 592 } 593 594 // ExtractPageFonts extracts all form fonts. 595 func ExtractFormFonts(ctx *model.Context) ([]Font, error) { 596 ff := []Font{} 597 for i, fontObject := range ctx.Optimize.FormFontObjects { 598 f, err := ExtractFont(ctx, *fontObject, i) 599 if err != nil { 600 return nil, err 601 } 602 if f != nil { 603 ff = append(ff, *f) 604 } 605 } 606 return ff, nil 607 } 608 609 // ExtractPages extracts pageNrs into a new single page context. 610 func ExtractPages(ctx *model.Context, pageNrs []int, usePgCache bool) (*model.Context, error) { 611 ctxDest, err := CreateContextWithXRefTable(ctx.Conf, types.PaperSize["A4"]) 612 if err != nil { 613 return nil, err 614 } 615 616 if err := AddPages(ctx, ctxDest, pageNrs, usePgCache); err != nil { 617 return nil, err 618 } 619 620 return ctxDest, nil 621 } 622 623 // ExtractPageContent extracts the consolidated page content stream for pageNr. 624 func ExtractPageContent(ctx *model.Context, pageNr int) (io.Reader, error) { 625 consolidateRes := false 626 d, _, _, err := ctx.PageDict(pageNr, consolidateRes) 627 if err != nil { 628 return nil, err 629 } 630 bb, err := ctx.PageContent(d, pageNr) 631 if err != nil && err != model.ErrNoContent { 632 return nil, err 633 } 634 return bytes.NewReader(bb), nil 635 } 636 637 // Metadata is a Reader representing a metadata dict. 638 type Metadata struct { 639 io.Reader // metadata 640 ObjNr int // metadata dict objNr 641 ParentObjNr int // container object number 642 ParentType string // container dict type 643 } 644 645 func extractMetadataFromDict(ctx *model.Context, d types.Dict, parentObjNr int) (*Metadata, error) { 646 o, found := d.Find("Metadata") 647 if !found || o == nil { 648 return nil, nil 649 } 650 sd, _, err := ctx.DereferenceStreamDict(o) 651 if err != nil { 652 return nil, err 653 } 654 if sd == nil { 655 return nil, nil 656 } 657 // Get metadata dict object number. 658 ir, _ := o.(types.IndirectRef) 659 mdObjNr := ir.ObjectNumber.Value() 660 // Get container dict type. 661 dt := "unknown" 662 if d.Type() != nil { 663 dt = *d.Type() 664 } 665 // Decode streamDict for supported filters only. 666 if err = sd.Decode(); err == filter.ErrUnsupportedFilter { 667 return nil, nil 668 } 669 if err != nil { 670 return nil, err 671 } 672 return &Metadata{bytes.NewReader(sd.Content), mdObjNr, parentObjNr, dt}, nil 673 } 674 675 // ExtractMetadata returns all metadata of ctx. 676 func ExtractMetadata(ctx *model.Context) ([]Metadata, error) { 677 mm := []Metadata{} 678 for k, v := range ctx.Table { 679 if v.Free || v.Compressed { 680 continue 681 } 682 switch d := v.Object.(type) { 683 case types.Dict: 684 md, err := extractMetadataFromDict(ctx, d, k) 685 if err != nil { 686 return nil, err 687 } 688 if md == nil { 689 continue 690 } 691 mm = append(mm, *md) 692 693 case types.StreamDict: 694 md, err := extractMetadataFromDict(ctx, d.Dict, k) 695 if err != nil { 696 return nil, err 697 } 698 if md == nil { 699 continue 700 } 701 mm = append(mm, *md) 702 } 703 } 704 return mm, nil 705 }