github.com/signintech/pdft@v0.5.0/pdf_data.go (about)

     1  package pdft
     2  
     3  import (
     4  	"bytes"
     5  	"compress/zlib"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  	"strconv"
    10  	"strings"
    11  )
    12  
// PDFData holds the in-memory representation of a PDF file: its trailer
// data, cross-reference entries and objects.
type PDFData struct {
	// trailer carries the root object ID used as the starting point for page
	// lookups (see getPageObjIDs).
	trailer  TrailerData
	xrefs    []XrefData
	// objIDs and objs are appended to together (see put) and trimmed together
	// (see removeObjByID), so objIDs[i] is the ID of objs[i].
	objIDs   []int
	objs     []PDFObjData
	// NOTE(review): pagesObj is not referenced in this file's visible code;
	// presumably it points at the page-tree root — confirm against callers.
	pagesObj *PDFObjData
}
    21  
// TrailerData holds the values kept from the PDF trailer.
type TrailerData struct {
	// rootObjID is the ID of the object from which page lookups and crawls
	// start (see getPageObjIDs and the inject* methods).
	rootObjID int
}
    26  
    27  // Len count
    28  func (p *PDFData) Len() int {
    29  	return len(p.objIDs)
    30  }
    31  
    32  func (p *PDFData) put(pdfobj PDFObjData) {
    33  	p.objIDs = append(p.objIDs, pdfobj.objID)
    34  	p.objs = append(p.objs, pdfobj)
    35  }
    36  
    37  func (p *PDFData) putNewObject(pdfobj PDFObjData) int {
    38  	newObjID := p.maxID() + 1
    39  	pdfobj.objID = newObjID
    40  	p.put(pdfobj)
    41  	return newObjID
    42  }
    43  
    44  func (p *PDFData) removeObjByID(objID int) error {
    45  	for i, id := range p.objIDs {
    46  		if id == objID {
    47  			p.objIDs = append(p.objIDs[:i], p.objIDs[i+1:]...)
    48  			p.objs = append(p.objs[:i], p.objs[i+1:]...)
    49  			return nil
    50  		}
    51  	}
    52  	return errors.New("Not Found")
    53  }
    54  
    55  // GetObjByID get obj by objid
    56  func (p *PDFData) getObjByID(objID int) *PDFObjData {
    57  	// if pdf exists annotations, it will have multiple same objIDs. So, need find the right one.
    58  	indexArr := []int{}
    59  	for i, id := range p.objIDs {
    60  		if id == objID {
    61  			indexArr = append(indexArr, i)
    62  		}
    63  	}
    64  	if len(indexArr) == 1 {
    65  		return &p.objs[indexArr[0]]
    66  	} else if len(indexArr) > 1 {
    67  		result := &p.objs[indexArr[0]]
    68  		for _, i := range indexArr {
    69  			if props, err := (&p.objs[i]).readProperties(); err == nil && props.getPropByKey("Annots") != nil {
    70  				result = &p.objs[i]
    71  			}
    72  		}
    73  		return result
    74  	}
    75  	return nil
    76  }
    77  
    78  // getPageCrawl use crawl, supporting for page nesting
    79  func (p *PDFData) getPageCrawl(objID int, path ...string) (*crawl, error) {
    80  	var cw crawl
    81  	pagePath := append([]string{"Pages"}, path...)
    82  	cw.set(p, objID, pagePath...)
    83  	cw.run()
    84  	checkedQueue := []int{}
    85  	for k := range cw.results {
    86  		checkedQueue = append(checkedQueue, k)
    87  	}
    88  	for len(checkedQueue) > 0 {
    89  		key := checkedQueue[0]
    90  		if s := cw.results[key].String(); strings.Contains(s, "/Pages") && strings.Contains(s, "/Parent") {
    91  			var subCw crawl
    92  			subCw.set(p, key, path...)
    93  			subCw.run()
    94  			for k, v := range subCw.results {
    95  				cw.results[k] = v
    96  				if _, ok := cw.results[k]; !ok {
    97  					checkedQueue = append(checkedQueue, k)
    98  				}
    99  			}
   100  		}
   101  		checkedQueue = checkedQueue[1:]
   102  	}
   103  	return &cw, nil
   104  }
   105  
   106  // getPageObjIDs get page obj IDs
   107  func (p *PDFData) getPageObjIDs() ([]int, error) {
   108  	results := []int{}
   109  	rootProps, _ := p.getObjByID(p.trailer.rootObjID).readProperties()
   110  	rootPagesID, _, _ := rootProps.getPropByKey("Pages").asDictionary()
   111  	objProps := map[int]*PDFObjPropertiesData{} // cache props
   112  	getObjProps := func(id int) *PDFObjPropertiesData {
   113  		if v, ok := objProps[id]; ok {
   114  			return v
   115  		}
   116  		if data, err := p.getObjByID(id).readProperties(); err == nil {
   117  			objProps[id] = data
   118  			return objProps[id]
   119  		}
   120  		return nil
   121  	}
   122  	getKids := func(id int) []int {
   123  		if props := getObjProps(id); props != nil {
   124  			if pages, kid := props.getPropByKey("Pages"), props.getPropByKey("Kids"); pages != nil && kid != nil {
   125  				kidIDs, _, _ := kid.asDictionaryArr()
   126  				return kidIDs
   127  			}
   128  		}
   129  		return nil
   130  	}
   131  	isPage := func(id int) bool {
   132  		if props := getObjProps(id); props != nil {
   133  			return props.getPropByKey("Page") != nil
   134  		}
   135  		return false
   136  	}
   137  	var visit func(id int) // Preorder Traversal, supporting for page nesting
   138  	visit = func(id int) {
   139  		if kids := getKids(id); kids != nil {
   140  			for _, kid := range kids {
   141  				visit(kid)
   142  			}
   143  		} else if isPage(id) {
   144  			results = append(results, id)
   145  		}
   146  	}
   147  	visit(rootPagesID)
   148  	return results, nil
   149  }
   150  
   151  func (p *PDFData) maxID() int {
   152  	max := 0
   153  	for _, id := range p.objIDs {
   154  		if id > max {
   155  			max = id
   156  		}
   157  	}
   158  	return max
   159  }
   160  
   161  func (p *PDFData) injectImgsToPDF(pdfImgs []*PDFImageData) error {
   162  	var err error
   163  	isEmbedResources := false
   164  	rootOfXObjectID := -1
   165  	resourcesContent := ""
   166  	cwRes, _ := p.getPageCrawl(p.trailer.rootObjID, "Kids", "Resources")
   167  	if err != nil {
   168  		return err
   169  	}
   170  	foundRes := false
   171  	for resID, r := range cwRes.results {
   172  		resources, err := r.valOf("Resources")
   173  		if err == ErrCrawlResultValOfNotFound {
   174  			continue
   175  		} else if err != nil {
   176  			return err
   177  		} else {
   178  			foundRes = true
   179  			resourcesID, _, err := readObjIDFromDictionary(resources)
   180  			if err == ErrorObjectIDNotFound {
   181  				rootOfXObjectID = resID
   182  				resourcesContent = resources
   183  				isEmbedResources = true
   184  			} else if err != nil {
   185  				return err
   186  			} else {
   187  				rootOfXObjectID = resourcesID
   188  				data := p.getObjByID(resourcesID)
   189  				if data != nil {
   190  					resourcesContent = string(data.data)
   191  				}
   192  				isEmbedResources = false
   193  			}
   194  			break
   195  		}
   196  	}
   197  
   198  	if !foundRes {
   199  		return errors.New("not found /Resources in /Type/Pages")
   200  	}
   201  
   202  	var cw crawl
   203  	//cw.set(p, p.trailer.rootObjID, "Pages", "Kids", "Resources", "XObject")
   204  	cw.set(p, rootOfXObjectID, "XObject")
   205  	err = cw.run()
   206  	if err != nil {
   207  		return err
   208  	}
   209  
   210  	found := false
   211  	xObjectVals := make(map[int]string)
   212  	for objID, r := range cw.results {
   213  		xobject, err := r.valOf("XObject")
   214  		if err == ErrCrawlResultValOfNotFound {
   215  			continue
   216  		} else if err != nil {
   217  			return err
   218  		} else {
   219  			xObjectVals[objID] = xobject
   220  			found = true
   221  		}
   222  	}
   223  
   224  	if !found { //ถ้ายังไม่เจออีก
   225  		cw2, _ := p.getPageCrawl(p.trailer.rootObjID, "Kids", "Resources", "XObject")
   226  		cw = *cw2
   227  		if err != nil {
   228  			return err
   229  		}
   230  		for objID, r := range cw.results {
   231  			xobject, err := r.valOf("XObject")
   232  			if err == ErrCrawlResultValOfNotFound {
   233  				continue
   234  			} else if err != nil {
   235  				return err
   236  			} else {
   237  				xObjectVals[objID] = xobject
   238  				found = true
   239  			}
   240  		}
   241  	}
   242  
   243  	var xobjs crawlResultXObjects
   244  	var xObjIndex int
   245  	xObjChar := "I"
   246  	if found {
   247  		for _, xObjectVal := range xObjectVals {
   248  			propVal := []byte(xObjectVal)
   249  			xobjs.parse(&propVal)
   250  			if len(xobjs) > 0 {
   251  				xObjChar = xobjs[len(xobjs)-1].xObjChar
   252  				if xobjs[len(xobjs)-1].xObjIndex > xObjIndex {
   253  					xObjIndex = xobjs[len(xobjs)-1].xObjIndex
   254  				}
   255  			}
   256  		}
   257  	}
   258  
   259  	i := 0
   260  	max := len(pdfImgs)
   261  	for i < max {
   262  		objID := pdfImgs[i].objID
   263  		pdfImgs[i].xObjChar = xObjChar
   264  		pdfImgs[i].xObjIndex = xObjIndex + i + 1
   265  
   266  		var xobj crawlResultXObject
   267  		xobj.xObjChar = xObjChar
   268  		xobj.xObjIndex = xObjIndex + i + 1
   269  		xobj.xObjObjID = objID
   270  		xobjs = append(xobjs, xobj)
   271  		i++
   272  	}
   273  
   274  	objMustReplaces := make(map[int]string)
   275  	if found {
   276  		for objID, r := range cw.results {
   277  			var oldXObjectStr string
   278  			oldXObjectStr, err = r.valOf("XObject")
   279  			if err == ErrCrawlResultValOfNotFound {
   280  				continue
   281  			} else if err != nil {
   282  				return err
   283  			}
   284  			var newXObjs crawlResultXObjects
   285  			bOldXObjectStr := []byte(oldXObjectStr)
   286  			newXObjs.parse(&bOldXObjectStr)
   287  			for _, xobj := range xobjs { // pick new item from xobjs into newXObjs
   288  				isExisted := false
   289  				for _, existedXObj := range newXObjs {
   290  					if existedXObj.xObjChar == xobj.xObjChar && existedXObj.xObjIndex == xobj.xObjIndex { // Avoid conflict of same xObjIndex when editing emerged pdf
   291  						isExisted = true
   292  						break
   293  					}
   294  				}
   295  				if !isExisted {
   296  					newXObjs = append(newXObjs, xobj)
   297  				}
   298  			}
   299  			r.setValOf("XObject", fmt.Sprintf("<<%s>>\n", newXObjs.String()))
   300  			objMustReplaces[objID] = r.String()
   301  		}
   302  	} else {
   303  		if isEmbedResources {
   304  			var cw01 crawl
   305  			cw01.set(p, p.trailer.rootObjID, "Pages", "Kids", "Resources")
   306  			err = cw01.run()
   307  			if err != nil {
   308  				return err
   309  			}
   310  			for objID, r := range cw01.results {
   311  				res, err := r.valOf("Resources")
   312  				if err == ErrCrawlResultValOfNotFound {
   313  					continue
   314  				} else if err != nil {
   315  					return err
   316  				} else {
   317  					res = strings.TrimSpace(res)
   318  					res = fmt.Sprintf("%s /XObject <<%s>>", res[2:len(res)-2], xobjs.String())
   319  					r.setValOf("Resources", fmt.Sprintf("<<%s>>\n", res))
   320  					objMustReplaces[objID] = r.String()
   321  				}
   322  			}
   323  		} else {
   324  			for objID, r := range cw.results {
   325  				res := strings.TrimSpace(resourcesContent)
   326  				res = fmt.Sprintf("<<%s>>\n", xobjs.String())
   327  				r.add("XObject", res)
   328  				objMustReplaces[objID] = r.String()
   329  				//fmt.Printf("%s\n", r.String())
   330  			}
   331  		}
   332  	}
   333  
   334  	for objID := range objMustReplaces {
   335  		p.getObjByID(objID).data = []byte("<<\n" + objMustReplaces[objID] + ">>\n")
   336  	}
   337  
   338  	return nil
   339  }
   340  
   341  func (p *PDFData) injectFontsToPDF(fontDatas map[string]*PDFFontData) error {
   342  	var err error
   343  	cw, _ := p.getPageCrawl(p.trailer.rootObjID, "Kids", "Resources", "Font")
   344  	if err != nil {
   345  		return err
   346  	}
   347  
   348  	maxFontIndex, err := findMaxFontIndex(cw, p)
   349  	if err != nil {
   350  		return err
   351  	}
   352  
   353  	var newCrFonts crawlResultFonts //font ใหม่ที่จะยัดเข้าไป
   354  	for _, pdffontdata := range fontDatas {
   355  		maxFontIndex++
   356  		newCrFonts.append(maxFontIndex, pdffontdata.fontID)
   357  		pdffontdata.setFontIndex(maxFontIndex)
   358  	}
   359  
   360  	objMustReplaces := make(map[int]string)
   361  	//หา obj ที่ต้องยัด font ใหม่ลงไป
   362  	for objID, r := range cw.results { //วน แต่ละ ojb
   363  		fontPropVal, err := r.valOf("Font")
   364  		if err == ErrCrawlResultValOfNotFound {
   365  			continue
   366  		} else if err != nil {
   367  			return err
   368  		}
   369  
   370  		fontPropValType := propertyType(fontPropVal)
   371  		if fontPropValType == object {
   372  			var crFonts crawlResultFonts
   373  			tmp := []byte(fontPropVal)
   374  			err = crFonts.parse(&tmp)
   375  			if err != nil {
   376  				return err
   377  			}
   378  			crFonts = append(crFonts, newCrFonts...)
   379  			r.setValOf("Font", "<<\n"+crFonts.String()+">>\n")
   380  			objMustReplaces[objID] = r.String()
   381  		} else if fontPropValType == dictionary {
   382  			var fontObjID int
   383  			fontObjID, _, err = readObjIDFromDictionary(fontPropVal)
   384  			if err != nil {
   385  				return err
   386  			}
   387  			var crFonts crawlResultFonts
   388  			fontObj := p.getObjByID(fontObjID)
   389  			err = crFonts.parse(&fontObj.data)
   390  			if err != nil {
   391  				return err
   392  			}
   393  			crFonts = append(crFonts, newCrFonts...)
   394  			objMustReplaces[fontObjID] = crFonts.String()
   395  		}
   396  	}
   397  
   398  	for objID := range objMustReplaces {
   399  		p.getObjByID(objID).data = []byte("<<\n" + objMustReplaces[objID] + ">>\n")
   400  	}
   401  
   402  	return nil
   403  }
   404  
   405  func (p *PDFData) injectContentToPDF(contenters *[]Contenter) error {
   406  
   407  	var err error
   408  	pageBuffs := make(map[int]*bytes.Buffer)
   409  	for _, ctn := range *contenters {
   410  		pageNum := ctn.page()
   411  		if _, ok := pageBuffs[pageNum]; !ok {
   412  			pageBuffs[pageNum] = new(bytes.Buffer)
   413  		}
   414  		var buff *bytes.Buffer
   415  		buff, err = ctn.toSteram()
   416  		if err != nil {
   417  			return err
   418  		}
   419  
   420  		//fmt.Printf("buff=%s\n\n", buff.String())
   421  
   422  		_, err = buff.WriteTo(pageBuffs[pageNum])
   423  		if err != nil {
   424  			return err
   425  		}
   426  	}
   427  	pageObjIDs, _ := p.getPageObjIDs()
   428  	objMustReplaces := make(map[int]string)
   429  	for pageIndex, pageObjID := range pageObjIDs {
   430  
   431  		var cw2Content crawl
   432  		cw2Content.set(p, pageObjID, "Contents")
   433  		err = cw2Content.run()
   434  		if err != nil {
   435  			return err
   436  		}
   437  
   438  		for _, r := range cw2Content.results {
   439  
   440  			//fmt.Printf("%s\n\n", r.String())
   441  
   442  			var propContentsVal string
   443  			// fmt.Printf("id=%d\n", id)
   444  			propContentsVal, err = r.valOf("Contents")
   445  			// fmt.Printf("%d propContentsVal=%s\n\n", 0, r.String())
   446  			if err == ErrCrawlResultValOfNotFound {
   447  				continue
   448  			}
   449  
   450  			propContentsValType := propertyType(propContentsVal)
   451  			/*if propContentsValType != dictionary {
   452  				return errors.New("not support /Contents type not dictionary yet")
   453  			}*/
   454  			var contentsObjID int
   455  			if propContentsValType == dictionary {
   456  				contentsObjID, _, err = readObjIDFromDictionary(propContentsVal)
   457  				if err != nil {
   458  					return err
   459  				}
   460  			} else if propContentsValType == array {
   461  				contentsObjIDs, _, err := readObjIDFromDictionaryArr(propContentsVal)
   462  				if err != nil || len(contentsObjIDs) <= 0 {
   463  					return err
   464  				}
   465  				contentsObjID = contentsObjIDs[0]
   466  			} else {
   467  				return errors.New("not support /Contents type not dictionary,array yet")
   468  			}
   469  
   470  			data := &p.getObjByID(contentsObjID).data
   471  			zip := true
   472  			propContentsObj, err := readProperty(data, "FlateDecode")
   473  			if err != nil {
   474  				return err
   475  			}
   476  			if propContentsObj == nil {
   477  				zip = false
   478  			}
   479  
   480  			var stm *bytes.Buffer
   481  			//fmt.Printf("\n-------------------%d-----------------------\n%s\n\n", contentsObjID, string(*data))
   482  			stmLen, err := streamLength(p, data)
   483  			if err != nil {
   484  				return err
   485  			}
   486  
   487  			stm, err = extractStream(data, stmLen, zip)
   488  			if err != nil {
   489  				return err
   490  			}
   491  			//fmt.Printf("stm=%s\n\n", stm.String())
   492  
   493  			if _, ok := pageBuffs[pageIndex+1]; ok {
   494  				stm.WriteString("\n")
   495  				pageBuffs[pageIndex+1].WriteTo(stm)
   496  				objMustReplaces[contentsObjID] = fmt.Sprintf("<<\n/Length %d\n>>\nstream\n%sendstream", stm.Len(), stm.String())
   497  			}
   498  
   499  		}
   500  	}
   501  
   502  	for objID := range objMustReplaces {
   503  		//_ = objID
   504  		//fmt.Printf("objID=%d\n", objID)
   505  		p.getObjByID(objID).data = []byte("" + objMustReplaces[objID] + "")
   506  		//fmt.Printf("objId=%d %s\n", objID, string(p.getObjByID(objID).data))
   507  	}
   508  
   509  	return nil
   510  }
   511  
   512  func streamLength(p *PDFData, data *[]byte) (int, error) {
   513  
   514  	prop, err := readProperty(data, "Length")
   515  	if err != nil {
   516  		return 0, err
   517  	}
   518  	if prop == nil {
   519  		prop, err = readProperty(data, "Length1")
   520  		if err != nil {
   521  			return 0, err
   522  		}
   523  		if prop == nil {
   524  			return 0, errors.New("/Length or /Length1 not found")
   525  		}
   526  	}
   527  
   528  	propType := prop.valType()
   529  	if propType == number {
   530  		return strconv.Atoi(strings.TrimSpace(prop.rawVal))
   531  	} else if propType == dictionary {
   532  		objID, _, err := prop.asDictionary()
   533  		if err != nil {
   534  			return 0, err
   535  		}
   536  		fontlengthObj := p.getObjByID(objID)
   537  		return strconv.Atoi(strings.TrimSpace(string(fontlengthObj.data)))
   538  	} else {
   539  		return 0, errors.New("/Length or /Length1  wrong type")
   540  	}
   541  
   542  }
   543  
   544  var extractStreamBytes = []byte{0x73, 0x74, 0x72, 0x65, 0x61, 0x6D}
   545  
   546  func extractStream(b *[]byte, length int, zip bool) (*bytes.Buffer, error) {
   547  
   548  	index := bytes.Index(*b, extractStreamBytes)
   549  	offset := len(extractStreamBytes)
   550  	tmp := (*b)[index+offset:]
   551  	tmp = bytes.TrimSpace(tmp)
   552  	tmp = tmp[0:length]
   553  	var buff bytes.Buffer
   554  	buff.Write(tmp)
   555  	if !zip {
   556  		return &buff, nil
   557  	}
   558  	r, err := zlib.NewReader(&buff)
   559  	if err != nil {
   560  		return nil, err
   561  	}
   562  	defer r.Close()
   563  	var out bytes.Buffer
   564  	_, err = io.Copy(&out, r)
   565  	if err != nil {
   566  		return nil, err
   567  	}
   568  	return &out, nil
   569  }
   570  
   571  func findMaxFontIndex(cw *crawl, p *PDFData) (int, error) {
   572  	//find max font index
   573  	max := 0
   574  	for _, item := range cw.results {
   575  		fontPropVal, err := item.valOf("Font")
   576  		if err == ErrCrawlResultValOfNotFound {
   577  			continue
   578  		} else if err != nil {
   579  			return 0, err
   580  		}
   581  
   582  		var crFonts crawlResultFonts
   583  		fontPropValType := propertyType(fontPropVal)
   584  		if fontPropValType == object {
   585  			tmp := []byte(fontPropVal)
   586  			err = crFonts.parse(&tmp)
   587  			if err != nil {
   588  				return 0, err
   589  			}
   590  			//fmt.Printf("%#v\n", crFonts)
   591  		} else if fontPropValType == dictionary {
   592  			var fontObjID int
   593  			fontObjID, _, err = readObjIDFromDictionary(fontPropVal)
   594  			if err != nil {
   595  				return 0, err
   596  			}
   597  			fontObj := p.getObjByID(fontObjID)
   598  			err = crFonts.parse(&fontObj.data)
   599  			if err != nil {
   600  				return 0, err
   601  			}
   602  			//fmt.Printf("%#v\n", crFonts)
   603  		} else {
   604  			return 0, errors.New("not support /Font type array yet")
   605  		}
   606  
   607  		maxFontIndex := crFonts.maxFontIndex()
   608  		if maxFontIndex > max {
   609  			max = maxFontIndex
   610  		}
   611  	}
   612  
   613  	return max, nil
   614  }
   615  
   616  func objIDFromStartObjLine(line string) (int, error) {
   617  	tokens := strings.Fields(line)
   618  	if len(tokens) < 3 {
   619  		return 0, errors.New("bad start obj")
   620  	}
   621  	id, err := strconv.Atoi(strings.TrimSpace(tokens[0]))
   622  	if err != nil {
   623  		return 0, err
   624  	}
   625  	return id, nil
   626  }