github.com/pdfcpu/pdfcpu@v0.11.1/pkg/api/extract.go (about) 1 /* 2 Copyright 2019 The pdfcpu Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package api 18 19 import ( 20 "bytes" 21 "fmt" 22 "io" 23 "os" 24 "path/filepath" 25 "sort" 26 "strconv" 27 "strings" 28 29 "github.com/pdfcpu/pdfcpu/pkg/log" 30 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu" 31 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model" 32 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types" 33 "github.com/pkg/errors" 34 ) 35 36 // ExtractImagesRaw returns []pdfcpu.Image containing io.Readers for images contained in selectedPages. 37 // Beware of memory intensive returned slice. 38 func ExtractImagesRaw(rs io.ReadSeeker, selectedPages []string, conf *model.Configuration) ([]map[int]model.Image, error) { 39 if rs == nil { 40 return nil, errors.New("pdfcpu: ExtractImages: missing rs") 41 } 42 43 if conf == nil { 44 conf = model.NewDefaultConfiguration() 45 } 46 conf.Cmd = model.EXTRACTIMAGES 47 48 ctx, err := ReadValidateAndOptimize(rs, conf) 49 if err != nil { 50 return nil, err 51 } 52 53 pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true) 54 if err != nil { 55 return nil, err 56 } 57 58 var images []map[int]model.Image 59 for i, v := range pages { 60 if !v { 61 continue 62 } 63 mm, err := pdfcpu.ExtractPageImages(ctx, i, false) 64 if err != nil { 65 return nil, err 66 } 67 images = append(images, mm) 68 } 69 70 return images, nil 71 } 72 73 // ExtractImages extracts and digests embedded image resources from rs for selected pages. 74 func ExtractImages(rs io.ReadSeeker, selectedPages []string, digestImage func(model.Image, bool, int) error, conf *model.Configuration) error { 75 if rs == nil { 76 return errors.New("pdfcpu: ExtractImages: missing rs") 77 } 78 79 if conf == nil { 80 conf = model.NewDefaultConfiguration() 81 } 82 conf.Cmd = model.EXTRACTIMAGES 83 84 ctx, err := ReadValidateAndOptimize(rs, conf) 85 if err != nil { 86 return err 87 } 88 89 pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true) 90 if err != nil { 91 return err 92 } 93 94 pageNrs := []int{} 95 for k, v := range pages { 96 if !v { 97 continue 98 } 99 pageNrs = append(pageNrs, k) 100 } 101 102 sort.Ints(pageNrs) 103 maxPageDigits := len(strconv.Itoa(pageNrs[len(pageNrs)-1])) 104 105 for _, i := range pageNrs { 106 mm, err := pdfcpu.ExtractPageImages(ctx, i, false) 107 if err != nil { 108 return err 109 } 110 singleImgPerPage := len(mm) == 1 111 for _, img := range mm { 112 if err := digestImage(img, singleImgPerPage, maxPageDigits); err != nil { 113 return err 114 } 115 } 116 } 117 118 return nil 119 } 120 121 // ExtractImagesFile dumps embedded image resources from inFile into outDir for selected pages. 122 func ExtractImagesFile(inFile, outDir string, selectedPages []string, conf *model.Configuration) error { 123 f, err := os.Open(inFile) 124 if err != nil { 125 return err 126 } 127 defer f.Close() 128 129 if log.CLIEnabled() { 130 log.CLI.Printf("extracting images from %s into %s/ ...\n", inFile, outDir) 131 } 132 fileName := strings.TrimSuffix(filepath.Base(inFile), ".pdf") 133 134 return ExtractImages(f, selectedPages, pdfcpu.WriteImageToDisk(outDir, fileName), conf) 135 } 136 137 func writeFonts(ff []pdfcpu.Font, outDir, fileName string) error { 138 for _, f := range ff { 139 outFile := filepath.Join(outDir, fmt.Sprintf("%s_%s.%s", fileName, f.Name, f.Type)) 140 logWritingTo(outFile) 141 w, err := os.Create(outFile) 142 if err != nil { 143 return err 144 } 145 if _, err = io.Copy(w, f); err != nil { 146 return err 147 } 148 if err := w.Close(); err != nil { 149 return err 150 } 151 } 152 153 return nil 154 } 155 156 // ExtractFonts dumps embedded fontfiles from rs into outDir for selected pages. 157 func ExtractFonts(rs io.ReadSeeker, outDir, fileName string, selectedPages []string, conf *model.Configuration) error { 158 if rs == nil { 159 return errors.New("pdfcpu: ExtractFonts: missing rs") 160 } 161 162 if conf == nil { 163 conf = model.NewDefaultConfiguration() 164 } 165 conf.Cmd = model.EXTRACTFONTS 166 167 ctx, err := ReadValidateAndOptimize(rs, conf) 168 if err != nil { 169 return err 170 } 171 172 pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true) 173 if err != nil { 174 return err 175 } 176 177 fileName = strings.TrimSuffix(filepath.Base(fileName), ".pdf") 178 179 objNrs, skipped := types.IntSet{}, types.IntSet{} 180 181 for i, v := range pages { 182 if !v { 183 continue 184 } 185 ff, err := pdfcpu.ExtractPageFonts(ctx, i, objNrs, skipped) 186 if err != nil { 187 return err 188 } 189 if err := writeFonts(ff, outDir, fileName); err != nil { 190 return err 191 } 192 } 193 194 ff, err := pdfcpu.ExtractFormFonts(ctx) 195 if err != nil { 196 return err 197 } 198 199 return writeFonts(ff, outDir, fileName) 200 } 201 202 // ExtractFontsFile dumps embedded fontfiles from inFile into outDir for selected pages. 203 func ExtractFontsFile(inFile, outDir string, selectedPages []string, conf *model.Configuration) error { 204 f, err := os.Open(inFile) 205 if err != nil { 206 return err 207 } 208 defer f.Close() 209 210 if log.CLIEnabled() { 211 log.CLI.Printf("extracting fonts from %s into %s/ ...\n", inFile, outDir) 212 } 213 214 return ExtractFonts(f, outDir, filepath.Base(inFile), selectedPages, conf) 215 } 216 217 // WritePage consumes an io.Reader containing some PDF bytes and writes to outDir/fileName. 218 func WritePage(r io.Reader, outDir, fileName string, pageNr int) error { 219 outFile := filepath.Join(outDir, fmt.Sprintf("%s_page_%d.pdf", fileName, pageNr)) 220 logWritingTo(outFile) 221 w, err := os.Create(outFile) 222 if err != nil { 223 return err 224 } 225 if _, err = io.Copy(w, r); err != nil { 226 return err 227 } 228 return w.Close() 229 } 230 231 // ExtractPage extracts the page with pageNr out of ctx into an io.Reader. 232 func ExtractPage(ctx *model.Context, pageNr int) (io.Reader, error) { 233 ctxNew, err := pdfcpu.ExtractPages(ctx, []int{pageNr}, false) 234 if err != nil { 235 return nil, err 236 } 237 238 var b bytes.Buffer 239 if err := WriteContext(ctxNew, &b); err != nil { 240 return nil, err 241 } 242 243 return &b, nil 244 } 245 246 // ExtractPages generates single page PDF files from rs in outDir for selected pages. 247 func ExtractPages(rs io.ReadSeeker, outDir, fileName string, selectedPages []string, conf *model.Configuration) error { 248 if rs == nil { 249 return errors.New("pdfcpu: ExtractPages: missing rs") 250 } 251 252 if conf == nil { 253 conf = model.NewDefaultConfiguration() 254 } 255 conf.Cmd = model.EXTRACTPAGES 256 257 ctx, err := ReadValidateAndOptimize(rs, conf) 258 if err != nil { 259 return err 260 } 261 262 pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true) 263 if err != nil { 264 return err 265 } 266 267 if len(pages) == 0 { 268 if log.CLIEnabled() { 269 log.CLI.Println("aborted: missing page numbers!") 270 } 271 return nil 272 } 273 274 fileName = strings.TrimSuffix(filepath.Base(fileName), ".pdf") 275 276 for _, i := range sortedPages(pages) { 277 r, err := ExtractPage(ctx, i) 278 if err != nil { 279 return err 280 } 281 if err := WritePage(r, outDir, fileName, i); err != nil { 282 return err 283 } 284 } 285 286 return nil 287 } 288 289 // ExtractPagesFile generates single page PDF files from inFile in outDir for selected pages. 290 func ExtractPagesFile(inFile, outDir string, selectedPages []string, conf *model.Configuration) error { 291 f, err := os.Open(inFile) 292 if err != nil { 293 return err 294 } 295 defer f.Close() 296 297 if log.CLIEnabled() { 298 log.CLI.Printf("extracting pages from %s into %s/ ...\n", inFile, outDir) 299 } 300 301 return ExtractPages(f, outDir, filepath.Base(inFile), selectedPages, conf) 302 } 303 304 // ExtractContent dumps "PDF source" files from rs into outDir for selected pages. 305 func ExtractContent(rs io.ReadSeeker, outDir, fileName string, selectedPages []string, conf *model.Configuration) error { 306 if rs == nil { 307 return errors.New("pdfcpu: ExtractContent: missing rs") 308 } 309 310 if conf == nil { 311 conf = model.NewDefaultConfiguration() 312 } 313 conf.Cmd = model.EXTRACTCONTENT 314 315 ctx, err := ReadValidateAndOptimize(rs, conf) 316 if err != nil { 317 return err 318 } 319 320 pages, err := PagesForPageSelection(ctx.PageCount, selectedPages, true, true) 321 if err != nil { 322 return err 323 } 324 325 fileName = strings.TrimSuffix(filepath.Base(fileName), ".pdf") 326 327 for p, v := range pages { 328 if !v { 329 continue 330 } 331 332 r, err := pdfcpu.ExtractPageContent(ctx, p) 333 if err != nil { 334 return err 335 } 336 if r == nil { 337 continue 338 } 339 340 outFile := filepath.Join(outDir, fmt.Sprintf("%s_Content_page_%d.txt", fileName, p)) 341 logWritingTo(outFile) 342 f, err := os.Create(outFile) 343 if err != nil { 344 return err 345 } 346 347 if _, err = io.Copy(f, r); err != nil { 348 return err 349 } 350 351 if err := f.Close(); err != nil { 352 return err 353 } 354 } 355 356 return nil 357 } 358 359 // ExtractContentFile dumps "PDF source" files from inFile into outDir for selected pages. 360 func ExtractContentFile(inFile, outDir string, selectedPages []string, conf *model.Configuration) error { 361 f, err := os.Open(inFile) 362 if err != nil { 363 return err 364 } 365 defer f.Close() 366 367 if log.CLIEnabled() { 368 log.CLI.Printf("extracting content from %s into %s/ ...\n", inFile, outDir) 369 } 370 371 return ExtractContent(f, outDir, inFile, selectedPages, conf) 372 } 373 374 // ExtractMetadata dumps all metadata dict entries for rs into outDir. 375 func ExtractMetadata(rs io.ReadSeeker, outDir, fileName string, conf *model.Configuration) error { 376 if rs == nil { 377 return errors.New("pdfcpu: ExtractMetadata: missing rs") 378 } 379 380 if conf == nil { 381 conf = model.NewDefaultConfiguration() 382 } 383 conf.Cmd = model.EXTRACTMETADATA 384 385 ctx, err := ReadValidateAndOptimize(rs, conf) 386 if err != nil { 387 return err 388 } 389 390 mm, err := pdfcpu.ExtractMetadata(ctx) 391 if err != nil { 392 return err 393 } 394 395 if len(mm) > 0 { 396 fileName = strings.TrimSuffix(filepath.Base(fileName), ".pdf") 397 for _, m := range mm { 398 outFile := filepath.Join(outDir, fmt.Sprintf("%s_Metadata_%s_%d_%d.txt", fileName, m.ParentType, m.ParentObjNr, m.ObjNr)) 399 logWritingTo(outFile) 400 f, err := os.Create(outFile) 401 if err != nil { 402 return err 403 } 404 if _, err = io.Copy(f, m); err != nil { 405 return err 406 } 407 if err := f.Close(); err != nil { 408 return err 409 } 410 } 411 } 412 413 return nil 414 } 415 416 // ExtractMetadataFile dumps all metadata dict entries for inFile into outDir. 417 func ExtractMetadataFile(inFile, outDir string, conf *model.Configuration) error { 418 f, err := os.Open(inFile) 419 if err != nil { 420 return err 421 } 422 defer f.Close() 423 424 if log.CLIEnabled() { 425 log.CLI.Printf("extracting metadata from %s into %s/ ...\n", inFile, outDir) 426 } 427 428 return ExtractMetadata(f, outDir, filepath.Base(inFile), conf) 429 }