github.com/pdfcpu/pdfcpu@v0.11.1/pkg/pdfcpu/extract.go (about)

     1  /*
     2  Copyright 2018 The pdfcpu Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package pdfcpu
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"io"
    23  	"strings"
    24  
    25  	"github.com/pdfcpu/pdfcpu/pkg/filter"
    26  	"github.com/pdfcpu/pdfcpu/pkg/log"
    27  	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/font"
    28  	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
    29  	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
    30  	"github.com/pkg/errors"
    31  )
    32  
    33  // ImageObjNrs returns all image dict objNrs for pageNr.
    34  // Requires an optimized context.
    35  func ImageObjNrs(ctx *model.Context, pageNr int) []int {
    36  	// TODO Exclude SMask image objects.
    37  	objNrs := []int{}
    38  
    39  	if pageNr < 1 {
    40  		return objNrs
    41  	}
    42  
    43  	imgObjNrs := ctx.Optimize.PageImages
    44  	if len(imgObjNrs) == 0 {
    45  		return objNrs
    46  	}
    47  
    48  	pageImgObjNrs := imgObjNrs[pageNr-1]
    49  	if pageImgObjNrs == nil {
    50  		return objNrs
    51  	}
    52  
    53  	for k, v := range pageImgObjNrs {
    54  		if v {
    55  			objNrs = append(objNrs, k)
    56  		}
    57  	}
    58  	return objNrs
    59  }
    60  
    61  // StreamLength returns sd's stream length.
    62  func StreamLength(ctx *model.Context, sd *types.StreamDict) (int64, error) {
    63  	if val := sd.Int64Entry("Length"); val != nil {
    64  		return *val, nil
    65  	}
    66  
    67  	indRef := sd.IndirectRefEntry("Length")
    68  	if indRef == nil {
    69  		return 0, nil
    70  	}
    71  
    72  	i, err := ctx.DereferenceInteger(*indRef)
    73  	if err != nil || i == nil {
    74  		return 0, err
    75  	}
    76  
    77  	return int64(*i), nil
    78  }
    79  
    80  // ColorSpaceString returns a string representation for sd's colorspace.
    81  func ColorSpaceString(ctx *model.Context, sd *types.StreamDict) (string, error) {
    82  	o, found := sd.Find("ColorSpace")
    83  	if !found {
    84  		return "", nil
    85  	}
    86  
    87  	o, err := ctx.Dereference(o)
    88  	if err != nil {
    89  		return "", err
    90  	}
    91  
    92  	switch cs := o.(type) {
    93  
    94  	case types.Name:
    95  		return string(cs), nil
    96  
    97  	case types.Array:
    98  		return string(cs[0].(types.Name)), nil
    99  	}
   100  
   101  	return "", nil
   102  }
   103  
   104  func colorSpaceNameComponents(cs types.Name) int {
   105  	switch cs {
   106  
   107  	case model.DeviceGrayCS:
   108  		return 1
   109  
   110  	case model.DeviceRGBCS:
   111  		return 3
   112  
   113  	case model.DeviceCMYKCS:
   114  		return 4
   115  	}
   116  
   117  	return 0
   118  }
   119  
   120  func indexedColorSpaceComponents(xRefTable *model.XRefTable, cs types.Array) (int, error) {
   121  	baseCS, err := xRefTable.Dereference(cs[1])
   122  	if err != nil {
   123  		return 0, err
   124  	}
   125  
   126  	switch cs := baseCS.(type) {
   127  	case types.Name:
   128  		return colorSpaceNameComponents(cs), nil
   129  
   130  	case types.Array:
   131  		switch cs[0].(types.Name) {
   132  
   133  		case model.CalGrayCS:
   134  			return 1, nil
   135  
   136  		case model.CalRGBCS:
   137  			return 3, nil
   138  
   139  		case model.LabCS:
   140  			return 3, nil
   141  
   142  		case model.ICCBasedCS:
   143  			iccProfileStream, _, err := xRefTable.DereferenceStreamDict(cs[1])
   144  			if err != nil {
   145  				return 0, err
   146  			}
   147  			n := iccProfileStream.IntEntry("N")
   148  			i := 0
   149  			if n != nil {
   150  				i = *n
   151  			}
   152  			return i, nil
   153  
   154  		case model.SeparationCS:
   155  			return 1, nil
   156  
   157  		case model.DeviceNCS:
   158  			return len(cs[1].(types.Array)), nil
   159  		}
   160  	}
   161  
   162  	return 0, nil
   163  }
   164  
   165  // ColorSpaceComponents returns the corresponding number of used color components for sd's colorspace.
   166  func ColorSpaceComponents(xRefTable *model.XRefTable, sd *types.StreamDict) (int, error) {
   167  	o, found := sd.Find("ColorSpace")
   168  	if !found {
   169  		return 0, nil
   170  	}
   171  
   172  	o, err := xRefTable.Dereference(o)
   173  	if err != nil {
   174  		return 0, err
   175  	}
   176  
   177  	switch cs := o.(type) {
   178  	case types.Name:
   179  		return colorSpaceNameComponents(cs), nil
   180  
   181  	case types.Array:
   182  		switch cs[0].(types.Name) {
   183  
   184  		case model.CalGrayCS:
   185  			return 1, nil
   186  
   187  		case model.CalRGBCS:
   188  			return 3, nil
   189  
   190  		case model.LabCS:
   191  			return 3, nil
   192  
   193  		case model.ICCBasedCS:
   194  			iccProfileStream, _, err := xRefTable.DereferenceStreamDict(cs[1])
   195  			if err != nil {
   196  				return 0, err
   197  			}
   198  			n := iccProfileStream.IntEntry("N")
   199  			i := 0
   200  			if n != nil {
   201  				i = *n
   202  			}
   203  			return i, nil
   204  
   205  		case model.SeparationCS:
   206  			return 1, nil
   207  
   208  		case model.DeviceNCS:
   209  			return len(cs[1].(types.Array)), nil
   210  
   211  		case model.IndexedCS:
   212  			return indexedColorSpaceComponents(xRefTable, cs)
   213  
   214  		}
   215  	}
   216  
   217  	return 0, nil
   218  }
   219  
   220  func imageWidth(ctx *model.Context, sd *types.StreamDict, objNr int) (int, error) {
   221  	obj, ok := sd.Find("Width")
   222  	if !ok {
   223  		return 0, errors.Errorf("pdfcpu: missing image width obj#%d", objNr)
   224  	}
   225  	i, err := ctx.DereferenceInteger(obj)
   226  	if err != nil {
   227  		return 0, err
   228  	}
   229  	return i.Value(), nil
   230  }
   231  
   232  func imageHeight(ctx *model.Context, sd *types.StreamDict, objNr int) (int, error) {
   233  	obj, ok := sd.Find("Height")
   234  	if !ok {
   235  		return 0, errors.Errorf("pdfcpu: missing image height obj#%d", objNr)
   236  	}
   237  	i, err := ctx.DereferenceInteger(obj)
   238  	if err != nil {
   239  		return 0, err
   240  	}
   241  	return i.Value(), nil
   242  }
   243  
   244  func imageStub(
   245  	ctx *model.Context,
   246  	sd *types.StreamDict,
   247  	resourceId, filters, lastFilter string,
   248  	decodeParms types.Dict,
   249  	thumb, imgMask bool,
   250  	objNr int) (*model.Image, error) {
   251  
   252  	w, err := imageWidth(ctx, sd, objNr)
   253  	if err != nil {
   254  		return nil, err
   255  	}
   256  
   257  	h, err := imageHeight(ctx, sd, objNr)
   258  	if err != nil {
   259  		return nil, err
   260  	}
   261  
   262  	cs, err := ColorSpaceString(ctx, sd)
   263  	if err != nil {
   264  		return nil, err
   265  	}
   266  
   267  	comp, err := ColorSpaceComponents(ctx.XRefTable, sd)
   268  	if err != nil {
   269  		return nil, err
   270  	}
   271  	if lastFilter == filter.CCITTFax {
   272  		comp = 1
   273  	}
   274  
   275  	bpc := 0
   276  	if i := sd.IntEntry("BitsPerComponent"); i != nil {
   277  		bpc = *i
   278  	}
   279  	// if jpx, bpc is undefined
   280  	if imgMask {
   281  		bpc = 1
   282  	}
   283  
   284  	var sMask bool
   285  	if sm, _ := sd.Find("SMask"); sm != nil {
   286  		sMask = true
   287  	}
   288  
   289  	var mask bool
   290  	if sm, _ := sd.Find("Mask"); sm != nil {
   291  		mask = true
   292  	}
   293  
   294  	var interpol bool
   295  	if b := sd.BooleanEntry("Interpolate"); b != nil && *b {
   296  		interpol = true
   297  	}
   298  
   299  	size, err := StreamLength(ctx, sd)
   300  	if err != nil {
   301  		return nil, err
   302  	}
   303  
   304  	var s string
   305  	if decodeParms != nil {
   306  		s = decodeParms.String()
   307  	}
   308  
   309  	img := &model.Image{
   310  		ObjNr:       objNr,
   311  		Name:        resourceId,
   312  		Thumb:       thumb,
   313  		IsImgMask:   imgMask,
   314  		HasImgMask:  mask,
   315  		HasSMask:    sMask,
   316  		Width:       w,
   317  		Height:      h,
   318  		Cs:          cs,
   319  		Comp:        comp,
   320  		Bpc:         bpc,
   321  		Interpol:    interpol,
   322  		Size:        size,
   323  		Filter:      filters,
   324  		DecodeParms: s,
   325  	}
   326  
   327  	return img, nil
   328  }
   329  
   330  func prepareExtractImage(sd *types.StreamDict) (string, string, types.Dict, bool) {
   331  	var imgMask bool
   332  	if im := sd.BooleanEntry("ImageMask"); im != nil && *im {
   333  		imgMask = true
   334  	}
   335  
   336  	var (
   337  		filters    string
   338  		lastFilter string
   339  		d          types.Dict
   340  	)
   341  
   342  	fpl := sd.FilterPipeline
   343  	if fpl != nil {
   344  		var s []string
   345  		for _, filter := range fpl {
   346  			s = append(s, filter.Name)
   347  			lastFilter = filter.Name
   348  			if filter.DecodeParms != nil {
   349  				d = filter.DecodeParms
   350  			}
   351  		}
   352  		filters = strings.Join(s, ",")
   353  	}
   354  
   355  	return filters, lastFilter, d, imgMask
   356  }
   357  func decodeImage(ctx *model.Context, sd *types.StreamDict, filters, lastFilter string, objNr int) error {
   358  	// CCITTDecoded images / (bit) masks don't have a ColorSpace attribute, but we render image files.
   359  	if lastFilter == filter.CCITTFax {
   360  		if _, err := ctx.DereferenceDictEntry(sd.Dict, "ColorSpace"); err != nil {
   361  			sd.InsertName("ColorSpace", model.DeviceGrayCS)
   362  		}
   363  	}
   364  
   365  	if lastFilter == filter.DCT {
   366  		comp, err := ColorSpaceComponents(ctx.XRefTable, sd)
   367  		if err != nil {
   368  			return err
   369  		}
   370  		sd.CSComponents = comp
   371  	}
   372  
   373  	switch lastFilter {
   374  
   375  	case filter.DCT, filter.JPX, filter.Flate, filter.LZW, filter.CCITTFax, filter.RunLength:
   376  		if err := sd.Decode(); err != nil {
   377  			return err
   378  		}
   379  
   380  	default:
   381  		msg := fmt.Sprintf("pdfcpu: ExtractImage(obj#%d): skipping img, filter %s unsupported", objNr, filters)
   382  		if log.DebugEnabled() {
   383  			log.Debug.Println(msg)
   384  		}
   385  		if log.CLIEnabled() {
   386  			log.CLI.Println(msg)
   387  		}
   388  		return nil
   389  	}
   390  
   391  	return nil
   392  }
   393  
   394  func img(
   395  	ctx *model.Context,
   396  	sd *types.StreamDict,
   397  	thumb bool,
   398  	resourceID, filters, lastFilter string,
   399  	objNr int) (*model.Image, error) {
   400  
   401  	if sd.FilterPipeline == nil {
   402  		sd.Content = sd.Raw
   403  	} else {
   404  		if err := decodeImage(ctx, sd, filters, lastFilter, objNr); err != nil {
   405  			return nil, err
   406  		}
   407  	}
   408  
   409  	r, t, err := RenderImage(ctx.XRefTable, sd, thumb, resourceID, objNr)
   410  	if err != nil {
   411  		return nil, err
   412  	}
   413  
   414  	img := &model.Image{
   415  		Reader:   r,
   416  		Name:     resourceID,
   417  		ObjNr:    objNr,
   418  		Thumb:    thumb,
   419  		FileType: t,
   420  	}
   421  
   422  	return img, nil
   423  }
   424  
   425  // ExtractImage extracts an image from sd.
   426  func ExtractImage(ctx *model.Context, sd *types.StreamDict, thumb bool, resourceID string, objNr int, stub bool) (*model.Image, error) {
   427  	if sd == nil {
   428  		return nil, nil
   429  	}
   430  
   431  	filters, lastFilter, decodeParms, imgMask := prepareExtractImage(sd)
   432  
   433  	if stub {
   434  		return imageStub(ctx, sd, resourceID, filters, lastFilter, decodeParms, thumb, imgMask, objNr)
   435  	}
   436  
   437  	return img(ctx, sd, thumb, resourceID, filters, lastFilter, objNr)
   438  }
   439  
   440  // ExtractPageImages extracts all images used by pageNr.
   441  // Optionally return stubs only.
   442  func ExtractPageImages(ctx *model.Context, pageNr int, stub bool) (map[int]model.Image, error) {
   443  	m := map[int]model.Image{}
   444  	for _, objNr := range ImageObjNrs(ctx, pageNr) {
   445  		imageObj := ctx.Optimize.ImageObjects[objNr]
   446  		img, err := ExtractImage(ctx, imageObj.ImageDict, false, imageObj.ResourceNames[pageNr-1], objNr, stub)
   447  		if err != nil {
   448  			return nil, err
   449  		}
   450  		if img != nil {
   451  			img.PageNr = pageNr
   452  			m[objNr] = *img
   453  		}
   454  	}
   455  	// Extract thumbnail for pageNr
   456  	if indRef, ok := ctx.PageThumbs[pageNr]; ok {
   457  		objNr := indRef.ObjectNumber.Value()
   458  		sd, _, err := ctx.DereferenceStreamDict(indRef)
   459  		if err != nil {
   460  			return nil, err
   461  		}
   462  		img, err := ExtractImage(ctx, sd, true, "", objNr, stub)
   463  		if err != nil {
   464  			return nil, err
   465  		}
   466  		if img != nil {
   467  			img.PageNr = pageNr
   468  			m[objNr] = *img
   469  		}
   470  	}
   471  	return m, nil
   472  }
   473  
// Font is a Reader representing an embedded font.
//
// Values are created positionally (Reader, Name, Type) by ExtractFont,
// so the order of the fields must not change.
type Font struct {
	// Reader provides the font data (ExtractFont wraps the decoded font file stream content).
	io.Reader
	// Name is the font name (ExtractFont uses the font object's FontName).
	Name string
	// Type is the font file type; ExtractFont sets "ttf" for TrueType fonts.
	Type string
}
   480  
   481  // FontObjNrs returns all font dict objNrs for pageNr.
   482  // Requires an optimized context.
   483  func FontObjNrs(ctx *model.Context, pageNr int) []int {
   484  	objNrs := []int{}
   485  
   486  	if pageNr < 1 {
   487  		return objNrs
   488  	}
   489  
   490  	fontObjNrs := ctx.Optimize.PageFonts
   491  	if len(fontObjNrs) == 0 {
   492  		return objNrs
   493  	}
   494  
   495  	pageFontObjNrs := fontObjNrs[pageNr-1]
   496  	if pageFontObjNrs == nil {
   497  		return objNrs
   498  	}
   499  
   500  	for k, v := range pageFontObjNrs {
   501  		if v {
   502  			objNrs = append(objNrs, k)
   503  		}
   504  	}
   505  	return objNrs
   506  }
   507  
   508  // ExtractFont extracts a font from fontObject.
   509  func ExtractFont(ctx *model.Context, fontObject model.FontObject, objNr int) (*Font, error) {
   510  	d, err := font.FontDescriptor(ctx.XRefTable, fontObject.FontDict, objNr)
   511  	if err != nil {
   512  		return nil, err
   513  	}
   514  
   515  	if d == nil {
   516  		if log.DebugEnabled() {
   517  			log.Debug.Printf("ExtractFont: ignoring obj#%d - no fontDescriptor available for font: %s\n", objNr, fontObject.FontName)
   518  		}
   519  		return nil, nil
   520  	}
   521  
   522  	ir := fontDescriptorFontFileIndirectObjectRef(d)
   523  	if ir == nil {
   524  		if log.DebugEnabled() {
   525  			log.Debug.Printf("ExtractFont: ignoring obj#%d - no font file available for font: %s\n", objNr, fontObject.FontName)
   526  		}
   527  		return nil, nil
   528  	}
   529  
   530  	var f *Font
   531  
   532  	fontType := fontObject.SubType()
   533  
   534  	switch fontType {
   535  
   536  	case "TrueType":
   537  		// ttf ... true type file
   538  		// ttc ... true type collection
   539  		sd, _, err := ctx.DereferenceStreamDict(*ir)
   540  		if err != nil {
   541  			return nil, err
   542  		}
   543  		if sd == nil {
   544  			return nil, errors.Errorf("extractFontData: corrupt font obj#%d for font: %s\n", objNr, fontObject.FontName)
   545  		}
   546  
   547  		// Decode streamDict if used filter is supported only.
   548  		err = sd.Decode()
   549  		if err == filter.ErrUnsupportedFilter {
   550  			return nil, nil
   551  		}
   552  		if err != nil {
   553  			return nil, err
   554  		}
   555  
   556  		f = &Font{bytes.NewReader(sd.Content), fontObject.FontName, "ttf"}
   557  
   558  	default:
   559  		s := fmt.Sprintf("extractFontData: obj#%d - unsupported fonttype %s -  font: %s\n", objNr, fontType, fontObject.FontName)
   560  		if log.InfoEnabled() {
   561  			log.Info.Println(s)
   562  		}
   563  		if log.CLIEnabled() {
   564  			log.CLI.Printf(s)
   565  		}
   566  		return nil, nil
   567  	}
   568  
   569  	return f, nil
   570  }
   571  
   572  // ExtractPageFonts extracts all fonts used by pageNr.
   573  func ExtractPageFonts(ctx *model.Context, pageNr int, objNrs, skipped types.IntSet) ([]Font, error) {
   574  	ff := []Font{}
   575  	for _, i := range FontObjNrs(ctx, pageNr) {
   576  		if objNrs[i] || skipped[i] {
   577  			continue
   578  		}
   579  		fontObject := ctx.Optimize.FontObjects[i]
   580  		f, err := ExtractFont(ctx, *fontObject, i)
   581  		if err != nil {
   582  			return nil, err
   583  		}
   584  		if f != nil {
   585  			ff = append(ff, *f)
   586  			objNrs[i] = true
   587  		} else {
   588  			skipped[i] = true
   589  		}
   590  	}
   591  	return ff, nil
   592  }
   593  
   594  // ExtractPageFonts extracts all form fonts.
   595  func ExtractFormFonts(ctx *model.Context) ([]Font, error) {
   596  	ff := []Font{}
   597  	for i, fontObject := range ctx.Optimize.FormFontObjects {
   598  		f, err := ExtractFont(ctx, *fontObject, i)
   599  		if err != nil {
   600  			return nil, err
   601  		}
   602  		if f != nil {
   603  			ff = append(ff, *f)
   604  		}
   605  	}
   606  	return ff, nil
   607  }
   608  
   609  // ExtractPages extracts pageNrs into a new single page context.
   610  func ExtractPages(ctx *model.Context, pageNrs []int, usePgCache bool) (*model.Context, error) {
   611  	ctxDest, err := CreateContextWithXRefTable(ctx.Conf, types.PaperSize["A4"])
   612  	if err != nil {
   613  		return nil, err
   614  	}
   615  
   616  	if err := AddPages(ctx, ctxDest, pageNrs, usePgCache); err != nil {
   617  		return nil, err
   618  	}
   619  
   620  	return ctxDest, nil
   621  }
   622  
   623  // ExtractPageContent extracts the consolidated page content stream for pageNr.
   624  func ExtractPageContent(ctx *model.Context, pageNr int) (io.Reader, error) {
   625  	consolidateRes := false
   626  	d, _, _, err := ctx.PageDict(pageNr, consolidateRes)
   627  	if err != nil {
   628  		return nil, err
   629  	}
   630  	bb, err := ctx.PageContent(d, pageNr)
   631  	if err != nil && err != model.ErrNoContent {
   632  		return nil, err
   633  	}
   634  	return bytes.NewReader(bb), nil
   635  }
   636  
// Metadata is a Reader representing a metadata dict.
//
// Values are created positionally (Reader, ObjNr, ParentObjNr, ParentType)
// by extractMetadataFromDict, so the order of the fields must not change.
// ParentType is "unknown" if the container dict does not declare a type.
type Metadata struct {
	io.Reader          // metadata
	ObjNr       int    // metadata dict objNr
	ParentObjNr int    // container object number
	ParentType  string // container dict type
}
   644  
   645  func extractMetadataFromDict(ctx *model.Context, d types.Dict, parentObjNr int) (*Metadata, error) {
   646  	o, found := d.Find("Metadata")
   647  	if !found || o == nil {
   648  		return nil, nil
   649  	}
   650  	sd, _, err := ctx.DereferenceStreamDict(o)
   651  	if err != nil {
   652  		return nil, err
   653  	}
   654  	if sd == nil {
   655  		return nil, nil
   656  	}
   657  	// Get metadata dict object number.
   658  	ir, _ := o.(types.IndirectRef)
   659  	mdObjNr := ir.ObjectNumber.Value()
   660  	// Get container dict type.
   661  	dt := "unknown"
   662  	if d.Type() != nil {
   663  		dt = *d.Type()
   664  	}
   665  	// Decode streamDict for supported filters only.
   666  	if err = sd.Decode(); err == filter.ErrUnsupportedFilter {
   667  		return nil, nil
   668  	}
   669  	if err != nil {
   670  		return nil, err
   671  	}
   672  	return &Metadata{bytes.NewReader(sd.Content), mdObjNr, parentObjNr, dt}, nil
   673  }
   674  
   675  // ExtractMetadata returns all metadata of ctx.
   676  func ExtractMetadata(ctx *model.Context) ([]Metadata, error) {
   677  	mm := []Metadata{}
   678  	for k, v := range ctx.Table {
   679  		if v.Free || v.Compressed {
   680  			continue
   681  		}
   682  		switch d := v.Object.(type) {
   683  		case types.Dict:
   684  			md, err := extractMetadataFromDict(ctx, d, k)
   685  			if err != nil {
   686  				return nil, err
   687  			}
   688  			if md == nil {
   689  				continue
   690  			}
   691  			mm = append(mm, *md)
   692  
   693  		case types.StreamDict:
   694  			md, err := extractMetadataFromDict(ctx, d.Dict, k)
   695  			if err != nil {
   696  				return nil, err
   697  			}
   698  			if md == nil {
   699  				continue
   700  			}
   701  			mm = append(mm, *md)
   702  		}
   703  	}
   704  	return mm, nil
   705  }