github.com/pdfcpu/pdfcpu@v0.11.1/pkg/api/extract.go (about)

     1  /*
     2  	Copyright 2019 The pdfcpu Authors.
     3  
     4  	Licensed under the Apache License, Version 2.0 (the "License");
     5  	you may not use this file except in compliance with the License.
     6  	You may obtain a copy of the License at
     7  
     8  		http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  	Unless required by applicable law or agreed to in writing, software
    11  	distributed under the License is distributed on an "AS IS" BASIS,
    12  	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  	See the License for the specific language governing permissions and
    14  	limitations under the License.
    15  */
    16  
    17  package api
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"io"
    23  	"os"
    24  	"path/filepath"
    25  	"sort"
    26  	"strconv"
    27  	"strings"
    28  
    29  	"github.com/pdfcpu/pdfcpu/pkg/log"
    30  	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu"
    31  	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
    32  	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
    33  	"github.com/pkg/errors"
    34  )
    35  
    36  // ExtractImagesRaw returns []pdfcpu.Image containing io.Readers for images contained in selectedPages.
    37  // Beware of memory intensive returned slice.
    38  func ExtractImagesRaw(rs io.ReadSeeker, selectedPages []string, conf *model.Configuration) ([]map[int]model.Image, error) {
    39  	if rs == nil {
    40  		return nil, errors.New("pdfcpu: ExtractImages: missing rs")
    41  	}
    42  
    43  	if conf == nil {
    44  		conf = model.NewDefaultConfiguration()
    45  	}
    46  	conf.Cmd = model.EXTRACTIMAGES
    47  
    48  	ctx, err := ReadValidateAndOptimize(rs, conf)
    49  	if err != nil {
    50  		return nil, err
    51  	}
    52  
    53  	pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true)
    54  	if err != nil {
    55  		return nil, err
    56  	}
    57  
    58  	var images []map[int]model.Image
    59  	for i, v := range pages {
    60  		if !v {
    61  			continue
    62  		}
    63  		mm, err := pdfcpu.ExtractPageImages(ctx, i, false)
    64  		if err != nil {
    65  			return nil, err
    66  		}
    67  		images = append(images, mm)
    68  	}
    69  
    70  	return images, nil
    71  }
    72  
    73  // ExtractImages extracts and digests embedded image resources from rs for selected pages.
    74  func ExtractImages(rs io.ReadSeeker, selectedPages []string, digestImage func(model.Image, bool, int) error, conf *model.Configuration) error {
    75  	if rs == nil {
    76  		return errors.New("pdfcpu: ExtractImages: missing rs")
    77  	}
    78  
    79  	if conf == nil {
    80  		conf = model.NewDefaultConfiguration()
    81  	}
    82  	conf.Cmd = model.EXTRACTIMAGES
    83  
    84  	ctx, err := ReadValidateAndOptimize(rs, conf)
    85  	if err != nil {
    86  		return err
    87  	}
    88  
    89  	pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true)
    90  	if err != nil {
    91  		return err
    92  	}
    93  
    94  	pageNrs := []int{}
    95  	for k, v := range pages {
    96  		if !v {
    97  			continue
    98  		}
    99  		pageNrs = append(pageNrs, k)
   100  	}
   101  
   102  	sort.Ints(pageNrs)
   103  	maxPageDigits := len(strconv.Itoa(pageNrs[len(pageNrs)-1]))
   104  
   105  	for _, i := range pageNrs {
   106  		mm, err := pdfcpu.ExtractPageImages(ctx, i, false)
   107  		if err != nil {
   108  			return err
   109  		}
   110  		singleImgPerPage := len(mm) == 1
   111  		for _, img := range mm {
   112  			if err := digestImage(img, singleImgPerPage, maxPageDigits); err != nil {
   113  				return err
   114  			}
   115  		}
   116  	}
   117  
   118  	return nil
   119  }
   120  
   121  // ExtractImagesFile dumps embedded image resources from inFile into outDir for selected pages.
   122  func ExtractImagesFile(inFile, outDir string, selectedPages []string, conf *model.Configuration) error {
   123  	f, err := os.Open(inFile)
   124  	if err != nil {
   125  		return err
   126  	}
   127  	defer f.Close()
   128  
   129  	if log.CLIEnabled() {
   130  		log.CLI.Printf("extracting images from %s into %s/ ...\n", inFile, outDir)
   131  	}
   132  	fileName := strings.TrimSuffix(filepath.Base(inFile), ".pdf")
   133  
   134  	return ExtractImages(f, selectedPages, pdfcpu.WriteImageToDisk(outDir, fileName), conf)
   135  }
   136  
   137  func writeFonts(ff []pdfcpu.Font, outDir, fileName string) error {
   138  	for _, f := range ff {
   139  		outFile := filepath.Join(outDir, fmt.Sprintf("%s_%s.%s", fileName, f.Name, f.Type))
   140  		logWritingTo(outFile)
   141  		w, err := os.Create(outFile)
   142  		if err != nil {
   143  			return err
   144  		}
   145  		if _, err = io.Copy(w, f); err != nil {
   146  			return err
   147  		}
   148  		if err := w.Close(); err != nil {
   149  			return err
   150  		}
   151  	}
   152  
   153  	return nil
   154  }
   155  
   156  // ExtractFonts dumps embedded fontfiles from rs into outDir for selected pages.
   157  func ExtractFonts(rs io.ReadSeeker, outDir, fileName string, selectedPages []string, conf *model.Configuration) error {
   158  	if rs == nil {
   159  		return errors.New("pdfcpu: ExtractFonts: missing rs")
   160  	}
   161  
   162  	if conf == nil {
   163  		conf = model.NewDefaultConfiguration()
   164  	}
   165  	conf.Cmd = model.EXTRACTFONTS
   166  
   167  	ctx, err := ReadValidateAndOptimize(rs, conf)
   168  	if err != nil {
   169  		return err
   170  	}
   171  
   172  	pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true)
   173  	if err != nil {
   174  		return err
   175  	}
   176  
   177  	fileName = strings.TrimSuffix(filepath.Base(fileName), ".pdf")
   178  
   179  	objNrs, skipped := types.IntSet{}, types.IntSet{}
   180  
   181  	for i, v := range pages {
   182  		if !v {
   183  			continue
   184  		}
   185  		ff, err := pdfcpu.ExtractPageFonts(ctx, i, objNrs, skipped)
   186  		if err != nil {
   187  			return err
   188  		}
   189  		if err := writeFonts(ff, outDir, fileName); err != nil {
   190  			return err
   191  		}
   192  	}
   193  
   194  	ff, err := pdfcpu.ExtractFormFonts(ctx)
   195  	if err != nil {
   196  		return err
   197  	}
   198  
   199  	return writeFonts(ff, outDir, fileName)
   200  }
   201  
   202  // ExtractFontsFile dumps embedded fontfiles from inFile into outDir for selected pages.
   203  func ExtractFontsFile(inFile, outDir string, selectedPages []string, conf *model.Configuration) error {
   204  	f, err := os.Open(inFile)
   205  	if err != nil {
   206  		return err
   207  	}
   208  	defer f.Close()
   209  
   210  	if log.CLIEnabled() {
   211  		log.CLI.Printf("extracting fonts from %s into %s/ ...\n", inFile, outDir)
   212  	}
   213  
   214  	return ExtractFonts(f, outDir, filepath.Base(inFile), selectedPages, conf)
   215  }
   216  
   217  // WritePage consumes an io.Reader containing some PDF bytes and writes to outDir/fileName.
   218  func WritePage(r io.Reader, outDir, fileName string, pageNr int) error {
   219  	outFile := filepath.Join(outDir, fmt.Sprintf("%s_page_%d.pdf", fileName, pageNr))
   220  	logWritingTo(outFile)
   221  	w, err := os.Create(outFile)
   222  	if err != nil {
   223  		return err
   224  	}
   225  	if _, err = io.Copy(w, r); err != nil {
   226  		return err
   227  	}
   228  	return w.Close()
   229  }
   230  
   231  // ExtractPage extracts the page with pageNr out of ctx into an io.Reader.
   232  func ExtractPage(ctx *model.Context, pageNr int) (io.Reader, error) {
   233  	ctxNew, err := pdfcpu.ExtractPages(ctx, []int{pageNr}, false)
   234  	if err != nil {
   235  		return nil, err
   236  	}
   237  
   238  	var b bytes.Buffer
   239  	if err := WriteContext(ctxNew, &b); err != nil {
   240  		return nil, err
   241  	}
   242  
   243  	return &b, nil
   244  }
   245  
   246  // ExtractPages generates single page PDF files from rs in outDir for selected pages.
   247  func ExtractPages(rs io.ReadSeeker, outDir, fileName string, selectedPages []string, conf *model.Configuration) error {
   248  	if rs == nil {
   249  		return errors.New("pdfcpu: ExtractPages: missing rs")
   250  	}
   251  
   252  	if conf == nil {
   253  		conf = model.NewDefaultConfiguration()
   254  	}
   255  	conf.Cmd = model.EXTRACTPAGES
   256  
   257  	ctx, err := ReadValidateAndOptimize(rs, conf)
   258  	if err != nil {
   259  		return err
   260  	}
   261  
   262  	pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true)
   263  	if err != nil {
   264  		return err
   265  	}
   266  
   267  	if len(pages) == 0 {
   268  		if log.CLIEnabled() {
   269  			log.CLI.Println("aborted: missing page numbers!")
   270  		}
   271  		return nil
   272  	}
   273  
   274  	fileName = strings.TrimSuffix(filepath.Base(fileName), ".pdf")
   275  
   276  	for _, i := range sortedPages(pages) {
   277  		r, err := ExtractPage(ctx, i)
   278  		if err != nil {
   279  			return err
   280  		}
   281  		if err := WritePage(r, outDir, fileName, i); err != nil {
   282  			return err
   283  		}
   284  	}
   285  
   286  	return nil
   287  }
   288  
   289  // ExtractPagesFile generates single page PDF files from inFile in outDir for selected pages.
   290  func ExtractPagesFile(inFile, outDir string, selectedPages []string, conf *model.Configuration) error {
   291  	f, err := os.Open(inFile)
   292  	if err != nil {
   293  		return err
   294  	}
   295  	defer f.Close()
   296  
   297  	if log.CLIEnabled() {
   298  		log.CLI.Printf("extracting pages from %s into %s/ ...\n", inFile, outDir)
   299  	}
   300  
   301  	return ExtractPages(f, outDir, filepath.Base(inFile), selectedPages, conf)
   302  }
   303  
   304  // ExtractContent dumps "PDF source" files from rs into outDir for selected pages.
   305  func ExtractContent(rs io.ReadSeeker, outDir, fileName string, selectedPages []string, conf *model.Configuration) error {
   306  	if rs == nil {
   307  		return errors.New("pdfcpu: ExtractContent: missing rs")
   308  	}
   309  
   310  	if conf == nil {
   311  		conf = model.NewDefaultConfiguration()
   312  	}
   313  	conf.Cmd = model.EXTRACTCONTENT
   314  
   315  	ctx, err := ReadValidateAndOptimize(rs, conf)
   316  	if err != nil {
   317  		return err
   318  	}
   319  
   320  	pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true)
   321  	if err != nil {
   322  		return err
   323  	}
   324  
   325  	fileName = strings.TrimSuffix(filepath.Base(fileName), ".pdf")
   326  
   327  	for p, v := range pages {
   328  		if !v {
   329  			continue
   330  		}
   331  
   332  		r, err := pdfcpu.ExtractPageContent(ctx, p)
   333  		if err != nil {
   334  			return err
   335  		}
   336  		if r == nil {
   337  			continue
   338  		}
   339  
   340  		outFile := filepath.Join(outDir, fmt.Sprintf("%s_Content_page_%d.txt", fileName, p))
   341  		logWritingTo(outFile)
   342  		f, err := os.Create(outFile)
   343  		if err != nil {
   344  			return err
   345  		}
   346  
   347  		if _, err = io.Copy(f, r); err != nil {
   348  			return err
   349  		}
   350  
   351  		if err := f.Close(); err != nil {
   352  			return err
   353  		}
   354  	}
   355  
   356  	return nil
   357  }
   358  
   359  // ExtractContentFile dumps "PDF source" files from inFile into outDir for selected pages.
   360  func ExtractContentFile(inFile, outDir string, selectedPages []string, conf *model.Configuration) error {
   361  	f, err := os.Open(inFile)
   362  	if err != nil {
   363  		return err
   364  	}
   365  	defer f.Close()
   366  
   367  	if log.CLIEnabled() {
   368  		log.CLI.Printf("extracting content from %s into %s/ ...\n", inFile, outDir)
   369  	}
   370  
   371  	return ExtractContent(f, outDir, inFile, selectedPages, conf)
   372  }
   373  
   374  // ExtractMetadata dumps all metadata dict entries for rs into outDir.
   375  func ExtractMetadata(rs io.ReadSeeker, outDir, fileName string, conf *model.Configuration) error {
   376  	if rs == nil {
   377  		return errors.New("pdfcpu: ExtractMetadata: missing rs")
   378  	}
   379  
   380  	if conf == nil {
   381  		conf = model.NewDefaultConfiguration()
   382  	}
   383  	conf.Cmd = model.EXTRACTMETADATA
   384  
   385  	ctx, err := ReadValidateAndOptimize(rs, conf)
   386  	if err != nil {
   387  		return err
   388  	}
   389  
   390  	mm, err := pdfcpu.ExtractMetadata(ctx)
   391  	if err != nil {
   392  		return err
   393  	}
   394  
   395  	if len(mm) > 0 {
   396  		fileName = strings.TrimSuffix(filepath.Base(fileName), ".pdf")
   397  		for _, m := range mm {
   398  			outFile := filepath.Join(outDir, fmt.Sprintf("%s_Metadata_%s_%d_%d.txt", fileName, m.ParentType, m.ParentObjNr, m.ObjNr))
   399  			logWritingTo(outFile)
   400  			f, err := os.Create(outFile)
   401  			if err != nil {
   402  				return err
   403  			}
   404  			if _, err = io.Copy(f, m); err != nil {
   405  				return err
   406  			}
   407  			if err := f.Close(); err != nil {
   408  				return err
   409  			}
   410  		}
   411  	}
   412  
   413  	return nil
   414  }
   415  
   416  // ExtractMetadataFile dumps all metadata dict entries for inFile into outDir.
   417  func ExtractMetadataFile(inFile, outDir string, conf *model.Configuration) error {
   418  	f, err := os.Open(inFile)
   419  	if err != nil {
   420  		return err
   421  	}
   422  	defer f.Close()
   423  
   424  	if log.CLIEnabled() {
   425  		log.CLI.Printf("extracting metadata from %s into %s/ ...\n", inFile, outDir)
   426  	}
   427  
   428  	return ExtractMetadata(f, outDir, filepath.Base(inFile), conf)
   429  }