github.com/pdfcpu/pdfcpu@v0.11.1/pkg/api/test/extract_test.go (about) 1 /* 2 Copyright 2020 The pdf Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package test 18 19 import ( 20 "fmt" 21 "io" 22 "os" 23 "path/filepath" 24 "strings" 25 "testing" 26 27 "github.com/pdfcpu/pdfcpu/pkg/api" 28 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu" 29 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types" 30 ) 31 32 func TestExtractImages(t *testing.T) { 33 msg := "TestExtractImages" 34 // Extract images for all pages into outDir. 35 for _, fn := range []string{"5116.DCT_Filter.pdf", "testImage.pdf", "go.pdf"} { 36 // Test writing files 37 fn = filepath.Join(inDir, fn) 38 if err := api.ExtractImagesFile(fn, outDir, nil, nil); err != nil { 39 t.Fatalf("%s %s: %v\n", msg, fn, err) 40 } 41 } 42 // Extract images for inFile starting with page 1 into outDir. 43 inFile := filepath.Join(inDir, "testImage.pdf") 44 if err := api.ExtractImagesFile(inFile, outDir, []string{"1-"}, nil); err != nil { 45 t.Fatalf("%s %s: %v\n", msg, inFile, err) 46 } 47 } 48 49 func compare(t *testing.T, fn1, fn2 string) { 50 51 f1, err := os.Open(fn1) 52 if err != nil { 53 t.Errorf("%s: %v", fn1, err) 54 return 55 } 56 defer f1.Close() 57 58 bb1, err := io.ReadAll(f1) 59 if err != nil { 60 t.Errorf("%s: %v", fn1, err) 61 return 62 } 63 64 f2, err := os.Open(fn2) 65 if err != nil { 66 t.Errorf("%s: %v", fn2, err) 67 return 68 } 69 defer f1.Close() 70 71 bb2, err := io.ReadAll(f2) 72 if err != nil { 73 t.Errorf("%s: %v", fn2, err) 74 return 75 } 76 77 if len(bb1) != len(bb2) { 78 t.Errorf("%s <-> %s: length mismatch %d != %d", fn1, fn2, len(bb1), len(bb2)) 79 return 80 } 81 82 for i := 0; i < len(bb1); i++ { 83 if bb1[i] != bb2[i] { 84 t.Errorf("%s <-> %s: mismatch at %d, 0x%02x != 0x%02x\n", fn1, fn2, i, bb1[i], bb2[i]) 85 return 86 } 87 } 88 89 } 90 91 func TestExtractImagesSoftMasks(t *testing.T) { 92 inFile := filepath.Join(inDir, "VectorApple.pdf") 93 ctx, err := api.ReadContextFile(inFile) 94 if err != nil { 95 t.Fatal(err) 96 } 97 98 images := make(map[int]*types.StreamDict) 99 for objId, obj := range ctx.XRefTable.Table { 100 if obj != nil { 101 if dict, ok := obj.Object.(types.StreamDict); ok { 102 if subtype := dict.Dict.NameEntry("Subtype"); subtype != nil && *subtype == "Image" { 103 images[objId] = &dict 104 } 105 } 106 } 107 } 108 109 expected := map[int]string{ 110 36: "VectorApple_36.tif", // IndexedCMYK w/ softmask 111 245: "VectorApple_245.tif", // DeviceCMYK w/ softmask 112 } 113 114 for objId, filename := range expected { 115 sd := images[objId] 116 117 if err := sd.Decode(); err != nil { 118 t.Fatal(err) 119 } 120 121 tmpFileName := filepath.Join(outDir, filename) 122 fmt.Printf("tmpFileName: %s\n", tmpFileName) 123 124 // Write the image object (as TIFF file) to disk. 125 // fn1 is the resulting fileName path including the suffix (aka filetype extension). 126 fn1, err := pdfcpu.WriteImage(ctx.XRefTable, tmpFileName, sd, false, objId) 127 if err != nil { 128 t.Fatalf("err: %v\n", err) 129 } 130 131 fn2 := filepath.Join(resDir, filename) 132 133 compare(t, fn1, fn2) 134 } 135 } 136 137 func TestExtractImagesLowLevel(t *testing.T) { 138 msg := "TestExtractImagesLowLevel" 139 fileName := "testImage.pdf" 140 inFile := filepath.Join(inDir, fileName) 141 142 // Create a context. 143 ctx, err := api.ReadContextFile(inFile) 144 if err != nil { 145 t.Fatalf("%s readContext: %v\n", msg, err) 146 } 147 148 // Optimize resource usage of this context. 149 if err := api.OptimizeContext(ctx); err != nil { 150 t.Fatalf("%s optimizeContext: %v\n", msg, err) 151 } 152 153 // Extract images for page 1. 154 i := 1 155 ii, err := pdfcpu.ExtractPageImages(ctx, i, false) 156 if err != nil { 157 t.Fatalf("%s extractPageFonts(%d): %v\n", msg, i, err) 158 } 159 160 baseFileName := strings.TrimSuffix(filepath.Base(fileName), ".pdf") 161 162 // Process extracted images. 163 for _, img := range ii { 164 fn := filepath.Join(outDir, fmt.Sprintf("%s_%d_%s.%s", baseFileName, i, img.Name, img.FileType)) 165 if err := pdfcpu.WriteReader(fn, img); err != nil { 166 t.Fatalf("%s write: %s", msg, fn) 167 } 168 } 169 } 170 171 func TestExtractFonts(t *testing.T) { 172 msg := "TestExtractFonts" 173 // Extract fonts for all pages into outDir. 174 for _, fn := range []string{"5116.DCT_Filter.pdf", "testImage.pdf", "go.pdf"} { 175 fn = filepath.Join(inDir, fn) 176 if err := api.ExtractFontsFile(fn, outDir, nil, nil); err != nil { 177 t.Fatalf("%s %s: %v\n", msg, fn, err) 178 } 179 } 180 // Extract fonts for inFile for pages 1-3 into outDir. 181 inFile := filepath.Join(inDir, "go.pdf") 182 if err := api.ExtractFontsFile(inFile, outDir, []string{"1-3"}, nil); err != nil { 183 t.Fatalf("%s %s: %v\n", msg, inFile, err) 184 } 185 } 186 187 func TestExtractFontsLowLevel(t *testing.T) { 188 msg := "TestExtractFontsLowLevel" 189 inFile := filepath.Join(inDir, "go.pdf") 190 191 // Create a context. 192 ctx, err := api.ReadContextFile(inFile) 193 if err != nil { 194 t.Fatalf("%s readContext: %v\n", msg, err) 195 } 196 197 // Optimize resource usage of this context. 198 if err := api.OptimizeContext(ctx); err != nil { 199 t.Fatalf("%s optimizeContext: %v\n", msg, err) 200 } 201 202 // Extract fonts for page 1. 203 i := 1 204 ff, err := pdfcpu.ExtractPageFonts(ctx, 1, types.IntSet{}, types.IntSet{}) 205 if err != nil { 206 t.Fatalf("%s extractPageFonts(%d): %v\n", msg, i, err) 207 } 208 209 // Process extracted fonts. 210 for _, f := range ff { 211 fn := filepath.Join(outDir, fmt.Sprintf("%s.%s", f.Name, f.Type)) 212 if err := pdfcpu.WriteReader(fn, f); err != nil { 213 t.Fatalf("%s write: %s", msg, fn) 214 } 215 } 216 } 217 218 func TestExtractPages(t *testing.T) { 219 msg := "TestExtractPages" 220 // Extract page #1 into outDir. 221 inFile := filepath.Join(inDir, "TheGoProgrammingLanguageCh1.pdf") 222 if err := api.ExtractPagesFile(inFile, outDir, []string{"1"}, nil); err != nil { 223 t.Fatalf("%s %s: %v\n", msg, inFile, err) 224 } 225 } 226 227 func TestExtractPagesLowLevel(t *testing.T) { 228 msg := "TestExtractPagesLowLevel" 229 inFile := filepath.Join(inDir, "TheGoProgrammingLanguageCh1.pdf") 230 outFile := "MyExtractedAndProcessedSinglePage.pdf" 231 232 // Create a context. 233 ctx, err := api.ReadContextFile(inFile) 234 if err != nil { 235 t.Fatalf("%s readContext: %v\n", msg, err) 236 } 237 238 // Extract page 1. 239 i := 1 240 241 r, err := api.ExtractPage(ctx, i) 242 if err != nil { 243 t.Fatalf("%s extractPage(%d): %v\n", msg, i, err) 244 } 245 246 if err := api.WritePage(r, outDir, outFile, i); err != nil { 247 t.Fatalf("%s writePage(%d): %v\n", msg, i, err) 248 } 249 250 } 251 252 func TestExtractContent(t *testing.T) { 253 msg := "TestExtractContent" 254 // Extract content of all pages into outDir. 255 inFile := filepath.Join(inDir, "5116.DCT_Filter.pdf") 256 if err := api.ExtractContentFile(inFile, outDir, nil, nil); err != nil { 257 t.Fatalf("%s %s: %v\n", msg, inFile, err) 258 } 259 } 260 261 func TestExtractContentLowLevel(t *testing.T) { 262 msg := "TestExtractContentLowLevel" 263 inFile := filepath.Join(inDir, "5116.DCT_Filter.pdf") 264 265 // Create a context. 266 ctx, err := api.ReadContextFile(inFile) 267 if err != nil { 268 t.Fatalf("%s read context: %v\n", msg, err) 269 } 270 271 // Extract page content for page 2. 272 i := 2 273 r, err := pdfcpu.ExtractPageContent(ctx, i) 274 if err != nil { 275 t.Fatalf("%s extractPageContent(%d): %v\n", msg, i, err) 276 } 277 278 // Process page content. 279 bb, err := io.ReadAll(r) 280 if err != nil { 281 t.Fatalf("%s readAll: %v\n", msg, err) 282 } 283 t.Logf("Page content (PDF-syntax) for page %d:\n%s", i, string(bb)) 284 } 285 286 func TestExtractMetadata(t *testing.T) { 287 msg := "TestExtractMetadata" 288 // Extract all metadata into outDir. 289 inFile := filepath.Join(inDir, "TheGoProgrammingLanguageCh1.pdf") 290 if err := api.ExtractMetadataFile(inFile, outDir, nil); err != nil { 291 t.Fatalf("%s %s: %v\n", msg, inFile, err) 292 } 293 } 294 295 func TestExtractMetadataLowLevel(t *testing.T) { 296 msg := "TestExtractMedadataLowLevel" 297 inFile := filepath.Join(inDir, "TheGoProgrammingLanguageCh1.pdf") 298 299 // Create a context. 300 ctx, err := api.ReadContextFile(inFile) 301 if err != nil { 302 t.Fatalf("%s readContext: %v\n", msg, err) 303 } 304 305 // Extract all metadata. 306 mm, err := pdfcpu.ExtractMetadata(ctx) 307 if err != nil { 308 t.Fatalf("%s ExtractMetadata: %v\n", msg, err) 309 } 310 311 // Process metadata. 312 for _, md := range mm { 313 bb, err := io.ReadAll(md) 314 if err != nil { 315 t.Fatalf("%s metadata readAll: %v\n", msg, err) 316 } 317 t.Logf("Metadata: objNr=%d parentDictObjNr=%d parentDictType=%s\n%s\n", 318 md.ObjNr, md.ParentObjNr, md.ParentType, string(bb)) 319 } 320 }