github.com/signintech/pdft@v0.5.0/pdf_data.go (about) 1 package pdft 2 3 import ( 4 "bytes" 5 "compress/zlib" 6 "errors" 7 "fmt" 8 "io" 9 "strconv" 10 "strings" 11 ) 12 13 // PDFData pdf file data 14 type PDFData struct { 15 trailer TrailerData 16 xrefs []XrefData 17 objIDs []int 18 objs []PDFObjData 19 pagesObj *PDFObjData 20 } 21 22 // TrailerData trailer 23 type TrailerData struct { 24 rootObjID int 25 } 26 27 // Len count 28 func (p *PDFData) Len() int { 29 return len(p.objIDs) 30 } 31 32 func (p *PDFData) put(pdfobj PDFObjData) { 33 p.objIDs = append(p.objIDs, pdfobj.objID) 34 p.objs = append(p.objs, pdfobj) 35 } 36 37 func (p *PDFData) putNewObject(pdfobj PDFObjData) int { 38 newObjID := p.maxID() + 1 39 pdfobj.objID = newObjID 40 p.put(pdfobj) 41 return newObjID 42 } 43 44 func (p *PDFData) removeObjByID(objID int) error { 45 for i, id := range p.objIDs { 46 if id == objID { 47 p.objIDs = append(p.objIDs[:i], p.objIDs[i+1:]...) 48 p.objs = append(p.objs[:i], p.objs[i+1:]...) 49 return nil 50 } 51 } 52 return errors.New("Not Found") 53 } 54 55 // GetObjByID get obj by objid 56 func (p *PDFData) getObjByID(objID int) *PDFObjData { 57 // if pdf exists annotations, it will have multiple same objIDs. So, need find the right one. 58 indexArr := []int{} 59 for i, id := range p.objIDs { 60 if id == objID { 61 indexArr = append(indexArr, i) 62 } 63 } 64 if len(indexArr) == 1 { 65 return &p.objs[indexArr[0]] 66 } else if len(indexArr) > 1 { 67 result := &p.objs[indexArr[0]] 68 for _, i := range indexArr { 69 if props, err := (&p.objs[i]).readProperties(); err == nil && props.getPropByKey("Annots") != nil { 70 result = &p.objs[i] 71 } 72 } 73 return result 74 } 75 return nil 76 } 77 78 // getPageCrawl use crawl, supporting for page nesting 79 func (p *PDFData) getPageCrawl(objID int, path ...string) (*crawl, error) { 80 var cw crawl 81 pagePath := append([]string{"Pages"}, path...) 82 cw.set(p, objID, pagePath...) 83 cw.run() 84 checkedQueue := []int{} 85 for k := range cw.results { 86 checkedQueue = append(checkedQueue, k) 87 } 88 for len(checkedQueue) > 0 { 89 key := checkedQueue[0] 90 if s := cw.results[key].String(); strings.Contains(s, "/Pages") && strings.Contains(s, "/Parent") { 91 var subCw crawl 92 subCw.set(p, key, path...) 93 subCw.run() 94 for k, v := range subCw.results { 95 cw.results[k] = v 96 if _, ok := cw.results[k]; !ok { 97 checkedQueue = append(checkedQueue, k) 98 } 99 } 100 } 101 checkedQueue = checkedQueue[1:] 102 } 103 return &cw, nil 104 } 105 106 // getPageObjIDs get page obj IDs 107 func (p *PDFData) getPageObjIDs() ([]int, error) { 108 results := []int{} 109 rootProps, _ := p.getObjByID(p.trailer.rootObjID).readProperties() 110 rootPagesID, _, _ := rootProps.getPropByKey("Pages").asDictionary() 111 objProps := map[int]*PDFObjPropertiesData{} // cache props 112 getObjProps := func(id int) *PDFObjPropertiesData { 113 if v, ok := objProps[id]; ok { 114 return v 115 } 116 if data, err := p.getObjByID(id).readProperties(); err == nil { 117 objProps[id] = data 118 return objProps[id] 119 } 120 return nil 121 } 122 getKids := func(id int) []int { 123 if props := getObjProps(id); props != nil { 124 if pages, kid := props.getPropByKey("Pages"), props.getPropByKey("Kids"); pages != nil && kid != nil { 125 kidIDs, _, _ := kid.asDictionaryArr() 126 return kidIDs 127 } 128 } 129 return nil 130 } 131 isPage := func(id int) bool { 132 if props := getObjProps(id); props != nil { 133 return props.getPropByKey("Page") != nil 134 } 135 return false 136 } 137 var visit func(id int) // Preorder Traversal, supporting for page nesting 138 visit = func(id int) { 139 if kids := getKids(id); kids != nil { 140 for _, kid := range kids { 141 visit(kid) 142 } 143 } else if isPage(id) { 144 results = append(results, id) 145 } 146 } 147 visit(rootPagesID) 148 return results, nil 149 } 150 151 func (p *PDFData) maxID() int { 152 max := 0 153 for _, id := range p.objIDs { 154 if id > max { 155 max = id 156 } 157 } 158 return max 159 } 160 161 func (p *PDFData) injectImgsToPDF(pdfImgs []*PDFImageData) error { 162 var err error 163 isEmbedResources := false 164 rootOfXObjectID := -1 165 resourcesContent := "" 166 cwRes, _ := p.getPageCrawl(p.trailer.rootObjID, "Kids", "Resources") 167 if err != nil { 168 return err 169 } 170 foundRes := false 171 for resID, r := range cwRes.results { 172 resources, err := r.valOf("Resources") 173 if err == ErrCrawlResultValOfNotFound { 174 continue 175 } else if err != nil { 176 return err 177 } else { 178 foundRes = true 179 resourcesID, _, err := readObjIDFromDictionary(resources) 180 if err == ErrorObjectIDNotFound { 181 rootOfXObjectID = resID 182 resourcesContent = resources 183 isEmbedResources = true 184 } else if err != nil { 185 return err 186 } else { 187 rootOfXObjectID = resourcesID 188 data := p.getObjByID(resourcesID) 189 if data != nil { 190 resourcesContent = string(data.data) 191 } 192 isEmbedResources = false 193 } 194 break 195 } 196 } 197 198 if !foundRes { 199 return errors.New("not found /Resources in /Type/Pages") 200 } 201 202 var cw crawl 203 //cw.set(p, p.trailer.rootObjID, "Pages", "Kids", "Resources", "XObject") 204 cw.set(p, rootOfXObjectID, "XObject") 205 err = cw.run() 206 if err != nil { 207 return err 208 } 209 210 found := false 211 xObjectVals := make(map[int]string) 212 for objID, r := range cw.results { 213 xobject, err := r.valOf("XObject") 214 if err == ErrCrawlResultValOfNotFound { 215 continue 216 } else if err != nil { 217 return err 218 } else { 219 xObjectVals[objID] = xobject 220 found = true 221 } 222 } 223 224 if !found { //ถ้ายังไม่เจออีก 225 cw2, _ := p.getPageCrawl(p.trailer.rootObjID, "Kids", "Resources", "XObject") 226 cw = *cw2 227 if err != nil { 228 return err 229 } 230 for objID, r := range cw.results { 231 xobject, err := r.valOf("XObject") 232 if err == ErrCrawlResultValOfNotFound { 233 continue 234 } else if err != nil { 235 return err 236 } else { 237 xObjectVals[objID] = xobject 238 found = true 239 } 240 } 241 } 242 243 var xobjs crawlResultXObjects 244 var xObjIndex int 245 xObjChar := "I" 246 if found { 247 for _, xObjectVal := range xObjectVals { 248 propVal := []byte(xObjectVal) 249 xobjs.parse(&propVal) 250 if len(xobjs) > 0 { 251 xObjChar = xobjs[len(xobjs)-1].xObjChar 252 if xobjs[len(xobjs)-1].xObjIndex > xObjIndex { 253 xObjIndex = xobjs[len(xobjs)-1].xObjIndex 254 } 255 } 256 } 257 } 258 259 i := 0 260 max := len(pdfImgs) 261 for i < max { 262 objID := pdfImgs[i].objID 263 pdfImgs[i].xObjChar = xObjChar 264 pdfImgs[i].xObjIndex = xObjIndex + i + 1 265 266 var xobj crawlResultXObject 267 xobj.xObjChar = xObjChar 268 xobj.xObjIndex = xObjIndex + i + 1 269 xobj.xObjObjID = objID 270 xobjs = append(xobjs, xobj) 271 i++ 272 } 273 274 objMustReplaces := make(map[int]string) 275 if found { 276 for objID, r := range cw.results { 277 var oldXObjectStr string 278 oldXObjectStr, err = r.valOf("XObject") 279 if err == ErrCrawlResultValOfNotFound { 280 continue 281 } else if err != nil { 282 return err 283 } 284 var newXObjs crawlResultXObjects 285 bOldXObjectStr := []byte(oldXObjectStr) 286 newXObjs.parse(&bOldXObjectStr) 287 for _, xobj := range xobjs { // pick new item from xobjs into newXObjs 288 isExisted := false 289 for _, existedXObj := range newXObjs { 290 if existedXObj.xObjChar == xobj.xObjChar && existedXObj.xObjIndex == xobj.xObjIndex { // Avoid conflict of same xObjIndex when editing emerged pdf 291 isExisted = true 292 break 293 } 294 } 295 if !isExisted { 296 newXObjs = append(newXObjs, xobj) 297 } 298 } 299 r.setValOf("XObject", fmt.Sprintf("<<%s>>\n", newXObjs.String())) 300 objMustReplaces[objID] = r.String() 301 } 302 } else { 303 if isEmbedResources { 304 var cw01 crawl 305 cw01.set(p, p.trailer.rootObjID, "Pages", "Kids", "Resources") 306 err = cw01.run() 307 if err != nil { 308 return err 309 } 310 for objID, r := range cw01.results { 311 res, err := r.valOf("Resources") 312 if err == ErrCrawlResultValOfNotFound { 313 continue 314 } else if err != nil { 315 return err 316 } else { 317 res = strings.TrimSpace(res) 318 res = fmt.Sprintf("%s /XObject <<%s>>", res[2:len(res)-2], xobjs.String()) 319 r.setValOf("Resources", fmt.Sprintf("<<%s>>\n", res)) 320 objMustReplaces[objID] = r.String() 321 } 322 } 323 } else { 324 for objID, r := range cw.results { 325 res := strings.TrimSpace(resourcesContent) 326 res = fmt.Sprintf("<<%s>>\n", xobjs.String()) 327 r.add("XObject", res) 328 objMustReplaces[objID] = r.String() 329 //fmt.Printf("%s\n", r.String()) 330 } 331 } 332 } 333 334 for objID := range objMustReplaces { 335 p.getObjByID(objID).data = []byte("<<\n" + objMustReplaces[objID] + ">>\n") 336 } 337 338 return nil 339 } 340 341 func (p *PDFData) injectFontsToPDF(fontDatas map[string]*PDFFontData) error { 342 var err error 343 cw, _ := p.getPageCrawl(p.trailer.rootObjID, "Kids", "Resources", "Font") 344 if err != nil { 345 return err 346 } 347 348 maxFontIndex, err := findMaxFontIndex(cw, p) 349 if err != nil { 350 return err 351 } 352 353 var newCrFonts crawlResultFonts //font ใหม่ที่จะยัดเข้าไป 354 for _, pdffontdata := range fontDatas { 355 maxFontIndex++ 356 newCrFonts.append(maxFontIndex, pdffontdata.fontID) 357 pdffontdata.setFontIndex(maxFontIndex) 358 } 359 360 objMustReplaces := make(map[int]string) 361 //หา obj ที่ต้องยัด font ใหม่ลงไป 362 for objID, r := range cw.results { //วน แต่ละ ojb 363 fontPropVal, err := r.valOf("Font") 364 if err == ErrCrawlResultValOfNotFound { 365 continue 366 } else if err != nil { 367 return err 368 } 369 370 fontPropValType := propertyType(fontPropVal) 371 if fontPropValType == object { 372 var crFonts crawlResultFonts 373 tmp := []byte(fontPropVal) 374 err = crFonts.parse(&tmp) 375 if err != nil { 376 return err 377 } 378 crFonts = append(crFonts, newCrFonts...) 379 r.setValOf("Font", "<<\n"+crFonts.String()+">>\n") 380 objMustReplaces[objID] = r.String() 381 } else if fontPropValType == dictionary { 382 var fontObjID int 383 fontObjID, _, err = readObjIDFromDictionary(fontPropVal) 384 if err != nil { 385 return err 386 } 387 var crFonts crawlResultFonts 388 fontObj := p.getObjByID(fontObjID) 389 err = crFonts.parse(&fontObj.data) 390 if err != nil { 391 return err 392 } 393 crFonts = append(crFonts, newCrFonts...) 394 objMustReplaces[fontObjID] = crFonts.String() 395 } 396 } 397 398 for objID := range objMustReplaces { 399 p.getObjByID(objID).data = []byte("<<\n" + objMustReplaces[objID] + ">>\n") 400 } 401 402 return nil 403 } 404 405 func (p *PDFData) injectContentToPDF(contenters *[]Contenter) error { 406 407 var err error 408 pageBuffs := make(map[int]*bytes.Buffer) 409 for _, ctn := range *contenters { 410 pageNum := ctn.page() 411 if _, ok := pageBuffs[pageNum]; !ok { 412 pageBuffs[pageNum] = new(bytes.Buffer) 413 } 414 var buff *bytes.Buffer 415 buff, err = ctn.toSteram() 416 if err != nil { 417 return err 418 } 419 420 //fmt.Printf("buff=%s\n\n", buff.String()) 421 422 _, err = buff.WriteTo(pageBuffs[pageNum]) 423 if err != nil { 424 return err 425 } 426 } 427 pageObjIDs, _ := p.getPageObjIDs() 428 objMustReplaces := make(map[int]string) 429 for pageIndex, pageObjID := range pageObjIDs { 430 431 var cw2Content crawl 432 cw2Content.set(p, pageObjID, "Contents") 433 err = cw2Content.run() 434 if err != nil { 435 return err 436 } 437 438 for _, r := range cw2Content.results { 439 440 //fmt.Printf("%s\n\n", r.String()) 441 442 var propContentsVal string 443 // fmt.Printf("id=%d\n", id) 444 propContentsVal, err = r.valOf("Contents") 445 // fmt.Printf("%d propContentsVal=%s\n\n", 0, r.String()) 446 if err == ErrCrawlResultValOfNotFound { 447 continue 448 } 449 450 propContentsValType := propertyType(propContentsVal) 451 /*if propContentsValType != dictionary { 452 return errors.New("not support /Contents type not dictionary yet") 453 }*/ 454 var contentsObjID int 455 if propContentsValType == dictionary { 456 contentsObjID, _, err = readObjIDFromDictionary(propContentsVal) 457 if err != nil { 458 return err 459 } 460 } else if propContentsValType == array { 461 contentsObjIDs, _, err := readObjIDFromDictionaryArr(propContentsVal) 462 if err != nil || len(contentsObjIDs) <= 0 { 463 return err 464 } 465 contentsObjID = contentsObjIDs[0] 466 } else { 467 return errors.New("not support /Contents type not dictionary,array yet") 468 } 469 470 data := &p.getObjByID(contentsObjID).data 471 zip := true 472 propContentsObj, err := readProperty(data, "FlateDecode") 473 if err != nil { 474 return err 475 } 476 if propContentsObj == nil { 477 zip = false 478 } 479 480 var stm *bytes.Buffer 481 //fmt.Printf("\n-------------------%d-----------------------\n%s\n\n", contentsObjID, string(*data)) 482 stmLen, err := streamLength(p, data) 483 if err != nil { 484 return err 485 } 486 487 stm, err = extractStream(data, stmLen, zip) 488 if err != nil { 489 return err 490 } 491 //fmt.Printf("stm=%s\n\n", stm.String()) 492 493 if _, ok := pageBuffs[pageIndex+1]; ok { 494 stm.WriteString("\n") 495 pageBuffs[pageIndex+1].WriteTo(stm) 496 objMustReplaces[contentsObjID] = fmt.Sprintf("<<\n/Length %d\n>>\nstream\n%sendstream", stm.Len(), stm.String()) 497 } 498 499 } 500 } 501 502 for objID := range objMustReplaces { 503 //_ = objID 504 //fmt.Printf("objID=%d\n", objID) 505 p.getObjByID(objID).data = []byte("" + objMustReplaces[objID] + "") 506 //fmt.Printf("objId=%d %s\n", objID, string(p.getObjByID(objID).data)) 507 } 508 509 return nil 510 } 511 512 func streamLength(p *PDFData, data *[]byte) (int, error) { 513 514 prop, err := readProperty(data, "Length") 515 if err != nil { 516 return 0, err 517 } 518 if prop == nil { 519 prop, err = readProperty(data, "Length1") 520 if err != nil { 521 return 0, err 522 } 523 if prop == nil { 524 return 0, errors.New("/Length or /Length1 not found") 525 } 526 } 527 528 propType := prop.valType() 529 if propType == number { 530 return strconv.Atoi(strings.TrimSpace(prop.rawVal)) 531 } else if propType == dictionary { 532 objID, _, err := prop.asDictionary() 533 if err != nil { 534 return 0, err 535 } 536 fontlengthObj := p.getObjByID(objID) 537 return strconv.Atoi(strings.TrimSpace(string(fontlengthObj.data))) 538 } else { 539 return 0, errors.New("/Length or /Length1 wrong type") 540 } 541 542 } 543 544 var extractStreamBytes = []byte{0x73, 0x74, 0x72, 0x65, 0x61, 0x6D} 545 546 func extractStream(b *[]byte, length int, zip bool) (*bytes.Buffer, error) { 547 548 index := bytes.Index(*b, extractStreamBytes) 549 offset := len(extractStreamBytes) 550 tmp := (*b)[index+offset:] 551 tmp = bytes.TrimSpace(tmp) 552 tmp = tmp[0:length] 553 var buff bytes.Buffer 554 buff.Write(tmp) 555 if !zip { 556 return &buff, nil 557 } 558 r, err := zlib.NewReader(&buff) 559 if err != nil { 560 return nil, err 561 } 562 defer r.Close() 563 var out bytes.Buffer 564 _, err = io.Copy(&out, r) 565 if err != nil { 566 return nil, err 567 } 568 return &out, nil 569 } 570 571 func findMaxFontIndex(cw *crawl, p *PDFData) (int, error) { 572 //find max font index 573 max := 0 574 for _, item := range cw.results { 575 fontPropVal, err := item.valOf("Font") 576 if err == ErrCrawlResultValOfNotFound { 577 continue 578 } else if err != nil { 579 return 0, err 580 } 581 582 var crFonts crawlResultFonts 583 fontPropValType := propertyType(fontPropVal) 584 if fontPropValType == object { 585 tmp := []byte(fontPropVal) 586 err = crFonts.parse(&tmp) 587 if err != nil { 588 return 0, err 589 } 590 //fmt.Printf("%#v\n", crFonts) 591 } else if fontPropValType == dictionary { 592 var fontObjID int 593 fontObjID, _, err = readObjIDFromDictionary(fontPropVal) 594 if err != nil { 595 return 0, err 596 } 597 fontObj := p.getObjByID(fontObjID) 598 err = crFonts.parse(&fontObj.data) 599 if err != nil { 600 return 0, err 601 } 602 //fmt.Printf("%#v\n", crFonts) 603 } else { 604 return 0, errors.New("not support /Font type array yet") 605 } 606 607 maxFontIndex := crFonts.maxFontIndex() 608 if maxFontIndex > max { 609 max = maxFontIndex 610 } 611 } 612 613 return max, nil 614 } 615 616 func objIDFromStartObjLine(line string) (int, error) { 617 tokens := strings.Fields(line) 618 if len(tokens) < 3 { 619 return 0, errors.New("bad start obj") 620 } 621 id, err := strconv.Atoi(strings.TrimSpace(tokens[0])) 622 if err != nil { 623 return 0, err 624 } 625 return id, nil 626 }