github.com/pdfcpu/pdfcpu@v0.11.1/pkg/api/split.go (about) 1 /* 2 Copyright 2020 The pdfcpu Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package api 18 19 import ( 20 "bytes" 21 "io" 22 "os" 23 "path/filepath" 24 "strconv" 25 "strings" 26 27 "github.com/pdfcpu/pdfcpu/pkg/log" 28 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu" 29 "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model" 30 "github.com/pkg/errors" 31 ) 32 33 type PageSpan struct { 34 From int 35 Thru int 36 Reader io.Reader 37 } 38 39 func pageSpan(ctx *model.Context, from, thru int) (*PageSpan, error) { 40 ctxNew, err := pdfcpu.ExtractPages(ctx, PagesForPageRange(from, thru), false) 41 if err != nil { 42 return nil, err 43 } 44 45 var b bytes.Buffer 46 if err := WriteContext(ctxNew, &b); err != nil { 47 return nil, err 48 } 49 50 return &PageSpan{From: from, Thru: thru, Reader: &b}, nil 51 } 52 53 func spanFileName(fileName string, from, thru int) string { 54 baseFileName := filepath.Base(fileName) 55 fn := strings.TrimSuffix(baseFileName, ".pdf") 56 fn = fn + "_" + strconv.Itoa(from) 57 if from == thru { 58 return fn + ".pdf" 59 } 60 return fn + "-" + strconv.Itoa(thru) + ".pdf" 61 } 62 63 func splitOutPath(outDir, fileName string, forBookmark bool, from, thru int) string { 64 p := filepath.Join(outDir, fileName+".pdf") 65 if !forBookmark { 66 p = filepath.Join(outDir, spanFileName(fileName, from, thru)) 67 } 68 return p 69 } 70 71 func writePageSpan(ctx *model.Context, from, thru int, outPath string) error { 72 ps, err := pageSpan(ctx, from, thru) 73 if err != nil { 74 return err 75 } 76 logWritingTo(outPath) 77 return pdfcpu.WriteReader(outPath, ps.Reader) 78 } 79 80 func context(rs io.ReadSeeker, conf *model.Configuration) (*model.Context, error) { 81 if conf == nil { 82 conf = model.NewDefaultConfiguration() 83 } 84 conf.Cmd = model.SPLIT 85 86 return ReadValidateAndOptimize(rs, conf) 87 } 88 89 func pageSpansSplitAlongBookmarks(ctx *model.Context) ([]*PageSpan, error) { 90 pss := []*PageSpan{} 91 92 bms, err := pdfcpu.Bookmarks(ctx) 93 if err != nil { 94 return nil, err 95 } 96 97 for _, bm := range bms { 98 99 from, thru := bm.PageFrom, bm.PageThru 100 if thru == 0 { 101 thru = ctx.PageCount 102 } 103 104 ps, err := pageSpan(ctx, from, thru) 105 if err != nil { 106 return nil, err 107 } 108 pss = append(pss, ps) 109 110 } 111 112 return pss, nil 113 } 114 115 func pageSpans(ctx *model.Context, span int) ([]*PageSpan, error) { 116 pss := []*PageSpan{} 117 118 for i := 0; i < ctx.PageCount/span; i++ { 119 start := i * span 120 from := start + 1 121 thru := start + span 122 ps, err := pageSpan(ctx, from, thru) 123 if err != nil { 124 return nil, err 125 } 126 pss = append(pss, ps) 127 } 128 129 // A possible last file has less than span pages. 130 if ctx.PageCount%span > 0 { 131 start := (ctx.PageCount / span) * span 132 from := start + 1 133 thru := ctx.PageCount 134 ps, err := pageSpan(ctx, from, thru) 135 if err != nil { 136 return nil, err 137 } 138 pss = append(pss, ps) 139 } 140 141 return pss, nil 142 } 143 144 func writePageSpans(ctx *model.Context, span int, outDir, fileName string) error { 145 forBookmark := false 146 147 for i := 0; i < ctx.PageCount/span; i++ { 148 start := i * span 149 from, thru := start+1, start+span 150 path := splitOutPath(outDir, fileName, forBookmark, from, thru) 151 if err := writePageSpan(ctx, from, thru, path); err != nil { 152 return err 153 } 154 } 155 156 // A possible last file has less than span pages. 157 if ctx.PageCount%span > 0 { 158 start := (ctx.PageCount / span) * span 159 from, thru := start+1, ctx.PageCount 160 path := splitOutPath(outDir, fileName, forBookmark, from, thru) 161 if err := writePageSpan(ctx, from, thru, path); err != nil { 162 return err 163 } 164 } 165 166 return nil 167 } 168 169 func writePageSpansSplitAlongBookmarks(ctx *model.Context, outDir string) error { 170 forBookmark := true 171 172 bms, err := pdfcpu.Bookmarks(ctx) 173 if err != nil { 174 return err 175 } 176 177 for _, bm := range bms { 178 fileName := strings.Replace(bm.Title, " ", "_", -1) 179 from, thru := bm.PageFrom, bm.PageThru 180 if thru == 0 { 181 thru = ctx.PageCount 182 } 183 path := splitOutPath(outDir, fileName, forBookmark, from, thru) 184 if err := writePageSpan(ctx, from, thru, path); err != nil { 185 return err 186 } 187 } 188 189 return nil 190 } 191 192 func writePageSpansSplitAlongPages(ctx *model.Context, pageNrs []int, outDir, fileName string) error { 193 // pageNumbers is a a sorted sequence of page numbers. 194 forBookmark := false 195 from, thru := 1, 0 196 197 if len(pageNrs) < 1 { 198 return errors.New("pdfcpu: split along pageNrs - missing pageNrs") 199 } 200 201 if pageNrs[0] > ctx.PageCount { 202 return errors.New("pdfcpu: split along pageNrs - invalid page number sequence.") 203 } 204 205 for i := 0; i < len(pageNrs); i++ { 206 thru = pageNrs[i] - 1 207 if thru >= ctx.PageCount { 208 break 209 } 210 path := splitOutPath(outDir, fileName, forBookmark, from, thru) 211 if err := writePageSpan(ctx, from, thru, path); err != nil { 212 return err 213 } 214 from = thru + 1 215 } 216 217 thru = ctx.PageCount 218 path := splitOutPath(outDir, fileName, forBookmark, from, thru) 219 return writePageSpan(ctx, from, thru, path) 220 } 221 222 // SplitRaw returns page spans for the PDF stream read from rs obeying given split span. 223 // If span == 1 splitting results in single page PDFs. 224 // If span == 0 we split along given bookmarks (level 1 only). 225 // Default span: 1 226 func SplitRaw(rs io.ReadSeeker, span int, conf *model.Configuration) ([]*PageSpan, error) { 227 if rs == nil { 228 return nil, errors.New("pdfcpu: SplitRaw: missing rs") 229 } 230 231 ctx, err := context(rs, conf) 232 if err != nil { 233 return nil, err 234 } 235 236 if span == 0 { 237 return pageSpansSplitAlongBookmarks(ctx) 238 } 239 return pageSpans(ctx, span) 240 } 241 242 // Split generates a sequence of PDF files in outDir for the PDF stream read from rs obeying given split span. 243 // If span == 1 splitting results in single page PDFs. 244 // If span == 0 we split along given bookmarks (level 1 only). 245 // Default span: 1 246 func Split(rs io.ReadSeeker, outDir, fileName string, span int, conf *model.Configuration) error { 247 if rs == nil { 248 return errors.New("pdfcpu: Split: missing rs") 249 } 250 251 ctx, err := context(rs, conf) 252 if err != nil { 253 return err 254 } 255 256 if span == 0 { 257 return writePageSpansSplitAlongBookmarks(ctx, outDir) 258 } 259 return writePageSpans(ctx, span, outDir, fileName) 260 } 261 262 // SplitFile generates a sequence of PDF files in outDir for inFile obeying given split span. 263 // If span == 1 splitting results in single page PDFs. 264 // If span == 0 we split along given bookmarks (level 1 only). 265 // Default span: 1 266 func SplitFile(inFile, outDir string, span int, conf *model.Configuration) error { 267 f, err := os.Open(inFile) 268 if err != nil { 269 return err 270 } 271 if log.CLIEnabled() { 272 log.CLI.Printf("splitting %s to %s/...\n", inFile, outDir) 273 } 274 275 defer func() { 276 if err != nil { 277 f.Close() 278 return 279 } 280 err = f.Close() 281 }() 282 283 return Split(f, outDir, filepath.Base(inFile), span, conf) 284 } 285 286 // SplitFile generates a sequence of PDF files in outDir for rs splitting along pageNrs. 287 func SplitByPageNr(rs io.ReadSeeker, outDir, fileName string, pageNrs []int, conf *model.Configuration) error { 288 if rs == nil { 289 return errors.New("pdfcpu: SplitByPageNr: missing rs") 290 } 291 292 ctx, err := context(rs, conf) 293 if err != nil { 294 return err 295 } 296 297 return writePageSpansSplitAlongPages(ctx, pageNrs, outDir, fileName) 298 } 299 300 // SplitFile generates a sequence of PDF files in outDir for inFile splitting it along pageNrs. 301 func SplitByPageNrFile(inFile, outDir string, pageNrs []int, conf *model.Configuration) error { 302 f, err := os.Open(inFile) 303 if err != nil { 304 return err 305 } 306 if log.CLIEnabled() { 307 log.CLI.Printf("splitting %s to %s/...\n", inFile, outDir) 308 } 309 310 defer func() { 311 if err != nil { 312 f.Close() 313 return 314 } 315 err = f.Close() 316 }() 317 318 return SplitByPageNr(f, outDir, filepath.Base(inFile), pageNrs, conf) 319 }