github.com/pdfcpu/pdfcpu@v0.11.1/pkg/api/test/extract_test.go (about)

     1  /*
     2  Copyright 2020 The pdf Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package test
    18  
    19  import (
    20  	"fmt"
    21  	"io"
    22  	"os"
    23  	"path/filepath"
    24  	"strings"
    25  	"testing"
    26  
    27  	"github.com/pdfcpu/pdfcpu/pkg/api"
    28  	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu"
    29  	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
    30  )
    31  
    32  func TestExtractImages(t *testing.T) {
    33  	msg := "TestExtractImages"
    34  	// Extract images for all pages into outDir.
    35  	for _, fn := range []string{"5116.DCT_Filter.pdf", "testImage.pdf", "go.pdf"} {
    36  		// Test writing files
    37  		fn = filepath.Join(inDir, fn)
    38  		if err := api.ExtractImagesFile(fn, outDir, nil, nil); err != nil {
    39  			t.Fatalf("%s %s: %v\n", msg, fn, err)
    40  		}
    41  	}
    42  	// Extract images for inFile starting with page 1 into outDir.
    43  	inFile := filepath.Join(inDir, "testImage.pdf")
    44  	if err := api.ExtractImagesFile(inFile, outDir, []string{"1-"}, nil); err != nil {
    45  		t.Fatalf("%s %s: %v\n", msg, inFile, err)
    46  	}
    47  }
    48  
    49  func compare(t *testing.T, fn1, fn2 string) {
    50  
    51  	f1, err := os.Open(fn1)
    52  	if err != nil {
    53  		t.Errorf("%s: %v", fn1, err)
    54  		return
    55  	}
    56  	defer f1.Close()
    57  
    58  	bb1, err := io.ReadAll(f1)
    59  	if err != nil {
    60  		t.Errorf("%s: %v", fn1, err)
    61  		return
    62  	}
    63  
    64  	f2, err := os.Open(fn2)
    65  	if err != nil {
    66  		t.Errorf("%s: %v", fn2, err)
    67  		return
    68  	}
    69  	defer f1.Close()
    70  
    71  	bb2, err := io.ReadAll(f2)
    72  	if err != nil {
    73  		t.Errorf("%s: %v", fn2, err)
    74  		return
    75  	}
    76  
    77  	if len(bb1) != len(bb2) {
    78  		t.Errorf("%s <-> %s: length mismatch %d != %d", fn1, fn2, len(bb1), len(bb2))
    79  		return
    80  	}
    81  
    82  	for i := 0; i < len(bb1); i++ {
    83  		if bb1[i] != bb2[i] {
    84  			t.Errorf("%s <-> %s: mismatch at %d, 0x%02x != 0x%02x\n", fn1, fn2, i, bb1[i], bb2[i])
    85  			return
    86  		}
    87  	}
    88  
    89  }
    90  
    91  func TestExtractImagesSoftMasks(t *testing.T) {
    92  	inFile := filepath.Join(inDir, "VectorApple.pdf")
    93  	ctx, err := api.ReadContextFile(inFile)
    94  	if err != nil {
    95  		t.Fatal(err)
    96  	}
    97  
    98  	images := make(map[int]*types.StreamDict)
    99  	for objId, obj := range ctx.XRefTable.Table {
   100  		if obj != nil {
   101  			if dict, ok := obj.Object.(types.StreamDict); ok {
   102  				if subtype := dict.Dict.NameEntry("Subtype"); subtype != nil && *subtype == "Image" {
   103  					images[objId] = &dict
   104  				}
   105  			}
   106  		}
   107  	}
   108  
   109  	expected := map[int]string{
   110  		36:  "VectorApple_36.tif",  // IndexedCMYK w/ softmask
   111  		245: "VectorApple_245.tif", // DeviceCMYK w/ softmask
   112  	}
   113  
   114  	for objId, filename := range expected {
   115  		sd := images[objId]
   116  
   117  		if err := sd.Decode(); err != nil {
   118  			t.Fatal(err)
   119  		}
   120  
   121  		tmpFileName := filepath.Join(outDir, filename)
   122  		fmt.Printf("tmpFileName: %s\n", tmpFileName)
   123  
   124  		// Write the image object (as TIFF file) to disk.
   125  		// fn1 is the resulting fileName path including the suffix (aka filetype extension).
   126  		fn1, err := pdfcpu.WriteImage(ctx.XRefTable, tmpFileName, sd, false, objId)
   127  		if err != nil {
   128  			t.Fatalf("err: %v\n", err)
   129  		}
   130  
   131  		fn2 := filepath.Join(resDir, filename)
   132  
   133  		compare(t, fn1, fn2)
   134  	}
   135  }
   136  
   137  func TestExtractImagesLowLevel(t *testing.T) {
   138  	msg := "TestExtractImagesLowLevel"
   139  	fileName := "testImage.pdf"
   140  	inFile := filepath.Join(inDir, fileName)
   141  
   142  	// Create a context.
   143  	ctx, err := api.ReadContextFile(inFile)
   144  	if err != nil {
   145  		t.Fatalf("%s readContext: %v\n", msg, err)
   146  	}
   147  
   148  	// Optimize resource usage of this context.
   149  	if err := api.OptimizeContext(ctx); err != nil {
   150  		t.Fatalf("%s optimizeContext: %v\n", msg, err)
   151  	}
   152  
   153  	// Extract images for page 1.
   154  	i := 1
   155  	ii, err := pdfcpu.ExtractPageImages(ctx, i, false)
   156  	if err != nil {
   157  		t.Fatalf("%s extractPageFonts(%d): %v\n", msg, i, err)
   158  	}
   159  
   160  	baseFileName := strings.TrimSuffix(filepath.Base(fileName), ".pdf")
   161  
   162  	// Process extracted images.
   163  	for _, img := range ii {
   164  		fn := filepath.Join(outDir, fmt.Sprintf("%s_%d_%s.%s", baseFileName, i, img.Name, img.FileType))
   165  		if err := pdfcpu.WriteReader(fn, img); err != nil {
   166  			t.Fatalf("%s write: %s", msg, fn)
   167  		}
   168  	}
   169  }
   170  
   171  func TestExtractFonts(t *testing.T) {
   172  	msg := "TestExtractFonts"
   173  	// Extract fonts for all pages into outDir.
   174  	for _, fn := range []string{"5116.DCT_Filter.pdf", "testImage.pdf", "go.pdf"} {
   175  		fn = filepath.Join(inDir, fn)
   176  		if err := api.ExtractFontsFile(fn, outDir, nil, nil); err != nil {
   177  			t.Fatalf("%s %s: %v\n", msg, fn, err)
   178  		}
   179  	}
   180  	// Extract fonts for inFile for pages 1-3 into outDir.
   181  	inFile := filepath.Join(inDir, "go.pdf")
   182  	if err := api.ExtractFontsFile(inFile, outDir, []string{"1-3"}, nil); err != nil {
   183  		t.Fatalf("%s %s: %v\n", msg, inFile, err)
   184  	}
   185  }
   186  
   187  func TestExtractFontsLowLevel(t *testing.T) {
   188  	msg := "TestExtractFontsLowLevel"
   189  	inFile := filepath.Join(inDir, "go.pdf")
   190  
   191  	// Create a context.
   192  	ctx, err := api.ReadContextFile(inFile)
   193  	if err != nil {
   194  		t.Fatalf("%s readContext: %v\n", msg, err)
   195  	}
   196  
   197  	// Optimize resource usage of this context.
   198  	if err := api.OptimizeContext(ctx); err != nil {
   199  		t.Fatalf("%s optimizeContext: %v\n", msg, err)
   200  	}
   201  
   202  	// Extract fonts for page 1.
   203  	i := 1
   204  	ff, err := pdfcpu.ExtractPageFonts(ctx, 1, types.IntSet{}, types.IntSet{})
   205  	if err != nil {
   206  		t.Fatalf("%s extractPageFonts(%d): %v\n", msg, i, err)
   207  	}
   208  
   209  	// Process extracted fonts.
   210  	for _, f := range ff {
   211  		fn := filepath.Join(outDir, fmt.Sprintf("%s.%s", f.Name, f.Type))
   212  		if err := pdfcpu.WriteReader(fn, f); err != nil {
   213  			t.Fatalf("%s write: %s", msg, fn)
   214  		}
   215  	}
   216  }
   217  
   218  func TestExtractPages(t *testing.T) {
   219  	msg := "TestExtractPages"
   220  	// Extract page #1 into outDir.
   221  	inFile := filepath.Join(inDir, "TheGoProgrammingLanguageCh1.pdf")
   222  	if err := api.ExtractPagesFile(inFile, outDir, []string{"1"}, nil); err != nil {
   223  		t.Fatalf("%s %s: %v\n", msg, inFile, err)
   224  	}
   225  }
   226  
   227  func TestExtractPagesLowLevel(t *testing.T) {
   228  	msg := "TestExtractPagesLowLevel"
   229  	inFile := filepath.Join(inDir, "TheGoProgrammingLanguageCh1.pdf")
   230  	outFile := "MyExtractedAndProcessedSinglePage.pdf"
   231  
   232  	// Create a context.
   233  	ctx, err := api.ReadContextFile(inFile)
   234  	if err != nil {
   235  		t.Fatalf("%s readContext: %v\n", msg, err)
   236  	}
   237  
   238  	// Extract page 1.
   239  	i := 1
   240  
   241  	r, err := api.ExtractPage(ctx, i)
   242  	if err != nil {
   243  		t.Fatalf("%s extractPage(%d): %v\n", msg, i, err)
   244  	}
   245  
   246  	if err := api.WritePage(r, outDir, outFile, i); err != nil {
   247  		t.Fatalf("%s writePage(%d): %v\n", msg, i, err)
   248  	}
   249  
   250  }
   251  
   252  func TestExtractContent(t *testing.T) {
   253  	msg := "TestExtractContent"
   254  	// Extract content of all pages into outDir.
   255  	inFile := filepath.Join(inDir, "5116.DCT_Filter.pdf")
   256  	if err := api.ExtractContentFile(inFile, outDir, nil, nil); err != nil {
   257  		t.Fatalf("%s %s: %v\n", msg, inFile, err)
   258  	}
   259  }
   260  
   261  func TestExtractContentLowLevel(t *testing.T) {
   262  	msg := "TestExtractContentLowLevel"
   263  	inFile := filepath.Join(inDir, "5116.DCT_Filter.pdf")
   264  
   265  	// Create a context.
   266  	ctx, err := api.ReadContextFile(inFile)
   267  	if err != nil {
   268  		t.Fatalf("%s read context: %v\n", msg, err)
   269  	}
   270  
   271  	// Extract page content for page 2.
   272  	i := 2
   273  	r, err := pdfcpu.ExtractPageContent(ctx, i)
   274  	if err != nil {
   275  		t.Fatalf("%s extractPageContent(%d): %v\n", msg, i, err)
   276  	}
   277  
   278  	// Process page content.
   279  	bb, err := io.ReadAll(r)
   280  	if err != nil {
   281  		t.Fatalf("%s readAll: %v\n", msg, err)
   282  	}
   283  	t.Logf("Page content (PDF-syntax) for page %d:\n%s", i, string(bb))
   284  }
   285  
   286  func TestExtractMetadata(t *testing.T) {
   287  	msg := "TestExtractMetadata"
   288  	// Extract all metadata into outDir.
   289  	inFile := filepath.Join(inDir, "TheGoProgrammingLanguageCh1.pdf")
   290  	if err := api.ExtractMetadataFile(inFile, outDir, nil); err != nil {
   291  		t.Fatalf("%s %s: %v\n", msg, inFile, err)
   292  	}
   293  }
   294  
   295  func TestExtractMetadataLowLevel(t *testing.T) {
   296  	msg := "TestExtractMedadataLowLevel"
   297  	inFile := filepath.Join(inDir, "TheGoProgrammingLanguageCh1.pdf")
   298  
   299  	// Create a context.
   300  	ctx, err := api.ReadContextFile(inFile)
   301  	if err != nil {
   302  		t.Fatalf("%s readContext: %v\n", msg, err)
   303  	}
   304  
   305  	// Extract all metadata.
   306  	mm, err := pdfcpu.ExtractMetadata(ctx)
   307  	if err != nil {
   308  		t.Fatalf("%s ExtractMetadata: %v\n", msg, err)
   309  	}
   310  
   311  	// Process metadata.
   312  	for _, md := range mm {
   313  		bb, err := io.ReadAll(md)
   314  		if err != nil {
   315  			t.Fatalf("%s metadata readAll: %v\n", msg, err)
   316  		}
   317  		t.Logf("Metadata: objNr=%d parentDictObjNr=%d parentDictType=%s\n%s\n",
   318  			md.ObjNr, md.ParentObjNr, md.ParentType, string(bb))
   319  	}
   320  }