github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/model/reader.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  package model
     7  
     8  import (
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"strings"
    13  
    14  	"github.com/unidoc/unidoc/common"
    15  	. "github.com/unidoc/unidoc/pdf/core"
    16  )
    17  
    18  // PdfReader represents a PDF file reader. It is a frontend to the lower level parsing mechanism and provides
    19  // a higher level access to work with PDF structure and information, such as the page structure etc.
    20  type PdfReader struct {
    21  	parser      *PdfParser
    22  	root        PdfObject
    23  	pages       *PdfObjectDictionary
    24  	pageList    []*PdfIndirectObject
    25  	PageList    []*PdfPage
    26  	pageCount   int
    27  	catalog     *PdfObjectDictionary
    28  	outlineTree *PdfOutlineTreeNode
    29  	AcroForm    *PdfAcroForm
    30  
    31  	modelManager *ModelManager
    32  
    33  	// For tracking traversal (cache).
    34  	traversed map[PdfObject]bool
    35  }
    36  
    37  // NewPdfReader returns a new PdfReader for an input io.ReadSeeker interface. Can be used to read PDF from
    38  // memory or file. Immediately loads and traverses the PDF structure including pages and page contents (if
    39  // not encrypted).
    40  func NewPdfReader(rs io.ReadSeeker) (*PdfReader, error) {
    41  	pdfReader := &PdfReader{}
    42  	pdfReader.traversed = map[PdfObject]bool{}
    43  
    44  	pdfReader.modelManager = NewModelManager()
    45  
    46  	// Create the parser, loads the cross reference table and trailer.
    47  	parser, err := NewParser(rs)
    48  	if err != nil {
    49  		return nil, err
    50  	}
    51  	pdfReader.parser = parser
    52  
    53  	isEncrypted, err := pdfReader.IsEncrypted()
    54  	if err != nil {
    55  		return nil, err
    56  	}
    57  
    58  	// Load pdf doc structure if not encrypted.
    59  	if !isEncrypted {
    60  		err = pdfReader.loadStructure()
    61  		if err != nil {
    62  			return nil, err
    63  		}
    64  	}
    65  
    66  	return pdfReader, nil
    67  }
    68  
    69  // IsEncrypted returns true if the PDF file is encrypted.
    70  func (this *PdfReader) IsEncrypted() (bool, error) {
    71  	return this.parser.IsEncrypted()
    72  }
    73  
    74  // GetEncryptionMethod returns a string containing some information about the encryption method used.
    75  // XXX/TODO: May be better to return a standardized struct with information.
    76  func (this *PdfReader) GetEncryptionMethod() string {
    77  	crypter := this.parser.GetCrypter()
    78  	str := crypter.Filter + " - "
    79  
    80  	if crypter.V == 0 {
    81  		str += "Undocumented algorithm"
    82  	} else if crypter.V == 1 {
    83  		// RC4 or AES (bits: 40)
    84  		str += "RC4: 40 bits"
    85  	} else if crypter.V == 2 {
    86  		str += fmt.Sprintf("RC4: %d bits", crypter.Length)
    87  	} else if crypter.V == 3 {
    88  		str += "Unpublished algorithm"
    89  	} else if crypter.V >= 4 {
    90  		// Look at CF, StmF, StrF
    91  		str += fmt.Sprintf("Stream filter: %s - String filter: %s", crypter.StreamFilter, crypter.StringFilter)
    92  		str += "; Crypt filters:"
    93  		for name, cf := range crypter.CryptFilters {
    94  			str += fmt.Sprintf(" - %s: %s (%d)", name, cf.Cfm, cf.Length)
    95  		}
    96  	}
    97  	perms := crypter.GetAccessPermissions()
    98  	str += fmt.Sprintf(" - %#v", perms)
    99  
   100  	return str
   101  }
   102  
   103  // Decrypt decrypts the PDF file with a specified password.  Also tries to
   104  // decrypt with an empty password.  Returns true if successful,
   105  // false otherwise.
   106  func (this *PdfReader) Decrypt(password []byte) (bool, error) {
   107  	success, err := this.parser.Decrypt(password)
   108  	if err != nil {
   109  		return false, err
   110  	}
   111  	if !success {
   112  		return false, nil
   113  	}
   114  
   115  	err = this.loadStructure()
   116  	if err != nil {
   117  		common.Log.Debug("ERROR: Fail to load structure (%s)", err)
   118  		return false, err
   119  	}
   120  
   121  	return true, nil
   122  }
   123  
   124  // CheckAccessRights checks access rights and permissions for a specified password.  If either user/owner
   125  // password is specified,  full rights are granted, otherwise the access rights are specified by the
   126  // Permissions flag.
   127  //
   128  // The bool flag indicates that the user can access and view the file.
   129  // The AccessPermissions shows what access the user has for editing etc.
   130  // An error is returned if there was a problem performing the authentication.
   131  func (this *PdfReader) CheckAccessRights(password []byte) (bool, AccessPermissions, error) {
   132  	return this.parser.CheckAccessRights(password)
   133  }
   134  
   135  // Loads the structure of the pdf file: pages, outlines, etc.
   136  func (this *PdfReader) loadStructure() error {
   137  	if this.parser.GetCrypter() != nil && !this.parser.IsAuthenticated() {
   138  		return fmt.Errorf("File need to be decrypted first")
   139  	}
   140  
   141  	trailerDict := this.parser.GetTrailer()
   142  	if trailerDict == nil {
   143  		return fmt.Errorf("Missing trailer")
   144  	}
   145  
   146  	// Catalog.
   147  	root, ok := trailerDict.Get("Root").(*PdfObjectReference)
   148  	if !ok {
   149  		return fmt.Errorf("Invalid Root (trailer: %s)", *trailerDict)
   150  	}
   151  	oc, err := this.parser.LookupByReference(*root)
   152  	if err != nil {
   153  		common.Log.Debug("ERROR: Failed to read root element catalog: %s", err)
   154  		return err
   155  	}
   156  	pcatalog, ok := oc.(*PdfIndirectObject)
   157  	if !ok {
   158  		common.Log.Debug("ERROR: Missing catalog: (root %q) (trailer %s)", oc, *trailerDict)
   159  		return errors.New("Missing catalog")
   160  	}
   161  	catalog, ok := (*pcatalog).PdfObject.(*PdfObjectDictionary)
   162  	if !ok {
   163  		common.Log.Debug("ERROR: Invalid catalog (%s)", pcatalog.PdfObject)
   164  		return errors.New("Invalid catalog")
   165  	}
   166  	common.Log.Trace("Catalog: %s", catalog)
   167  
   168  	// Pages.
   169  	pagesRef, ok := catalog.Get("Pages").(*PdfObjectReference)
   170  	if !ok {
   171  		return errors.New("Pages in catalog should be a reference")
   172  	}
   173  	op, err := this.parser.LookupByReference(*pagesRef)
   174  	if err != nil {
   175  		common.Log.Debug("ERROR: Failed to read pages")
   176  		return err
   177  	}
   178  	ppages, ok := op.(*PdfIndirectObject)
   179  	if !ok {
   180  		common.Log.Debug("ERROR: Pages object invalid")
   181  		common.Log.Debug("op: %p", ppages)
   182  		return errors.New("Pages object invalid")
   183  	}
   184  	pages, ok := ppages.PdfObject.(*PdfObjectDictionary)
   185  	if !ok {
   186  		common.Log.Debug("ERROR: Pages object invalid (%s)", ppages)
   187  		return errors.New("Pages object invalid")
   188  	}
   189  	pageCount, ok := pages.Get("Count").(*PdfObjectInteger)
   190  	if !ok {
   191  		common.Log.Debug("ERROR: Pages count object invalid")
   192  		return errors.New("Pages count invalid")
   193  	}
   194  
   195  	this.root = root
   196  	this.catalog = catalog
   197  	this.pages = pages
   198  	this.pageCount = int(*pageCount)
   199  	this.pageList = []*PdfIndirectObject{}
   200  
   201  	traversedPageNodes := map[PdfObject]bool{}
   202  	err = this.buildPageList(ppages, nil, traversedPageNodes)
   203  	if err != nil {
   204  		return err
   205  	}
   206  	common.Log.Trace("---")
   207  	common.Log.Trace("TOC")
   208  	common.Log.Trace("Pages")
   209  	common.Log.Trace("%d: %s", len(this.pageList), this.pageList)
   210  
   211  	// Outlines.
   212  	this.outlineTree, err = this.loadOutlines()
   213  	if err != nil {
   214  		common.Log.Debug("ERROR: Failed to build outline tree (%s)", err)
   215  		return err
   216  	}
   217  
   218  	// Load interactive forms and fields.
   219  	this.AcroForm, err = this.loadForms()
   220  	if err != nil {
   221  		return err
   222  	}
   223  
   224  	return nil
   225  }
   226  
   227  // Trace to object.  Keeps a list of already visited references to avoid circular references.
   228  //
   229  // Example circular reference.
   230  // 1 0 obj << /Next 2 0 R >>
   231  // 2 0 obj << /Next 1 0 R >>
   232  //
   233  func (this *PdfReader) traceToObjectWrapper(obj PdfObject, refList map[*PdfObjectReference]bool) (PdfObject, error) {
   234  	// Keep a list of references to avoid circular references.
   235  
   236  	ref, isRef := obj.(*PdfObjectReference)
   237  	if isRef {
   238  		// Make sure not already visited (circular ref).
   239  		if _, alreadyTraversed := refList[ref]; alreadyTraversed {
   240  			return nil, errors.New("Circular reference")
   241  		}
   242  		refList[ref] = true
   243  		obj, err := this.parser.LookupByReference(*ref)
   244  		if err != nil {
   245  			return nil, err
   246  		}
   247  		return this.traceToObjectWrapper(obj, refList)
   248  	}
   249  
   250  	// Not a reference, an object.  Can be indirect or any direct pdf object (other than reference).
   251  	return obj, nil
   252  }
   253  
   254  func (this *PdfReader) traceToObject(obj PdfObject) (PdfObject, error) {
   255  	refList := map[*PdfObjectReference]bool{}
   256  	return this.traceToObjectWrapper(obj, refList)
   257  }
   258  
   259  func (this *PdfReader) loadOutlines() (*PdfOutlineTreeNode, error) {
   260  	if this.parser.GetCrypter() != nil && !this.parser.IsAuthenticated() {
   261  		return nil, fmt.Errorf("File need to be decrypted first")
   262  	}
   263  
   264  	// Has outlines? Otherwise return an empty outlines structure.
   265  	catalog := this.catalog
   266  	outlinesObj := catalog.Get("Outlines")
   267  	if outlinesObj == nil {
   268  		return nil, nil
   269  	}
   270  
   271  	common.Log.Trace("-Has outlines")
   272  	// Trace references to the object.
   273  	outlineRootObj, err := this.traceToObject(outlinesObj)
   274  	if err != nil {
   275  		common.Log.Debug("ERROR: Failed to read outlines")
   276  		return nil, err
   277  	}
   278  	common.Log.Trace("Outline root: %v", outlineRootObj)
   279  
   280  	if _, isNull := outlineRootObj.(*PdfObjectNull); isNull {
   281  		common.Log.Trace("Outline root is null - no outlines")
   282  		return nil, nil
   283  	}
   284  
   285  	outlineRoot, ok := outlineRootObj.(*PdfIndirectObject)
   286  	if !ok {
   287  		return nil, errors.New("Outline root should be an indirect object")
   288  	}
   289  
   290  	dict, ok := outlineRoot.PdfObject.(*PdfObjectDictionary)
   291  	if !ok {
   292  		return nil, errors.New("Outline indirect object should contain a dictionary")
   293  	}
   294  
   295  	common.Log.Trace("Outline root dict: %v", dict)
   296  
   297  	outlineTree, _, err := this.buildOutlineTree(outlineRoot, nil, nil)
   298  	if err != nil {
   299  		return nil, err
   300  	}
   301  	common.Log.Trace("Resulting outline tree: %v", outlineTree)
   302  
   303  	return outlineTree, nil
   304  }
   305  
   306  // Recursive build outline tree.
   307  // prev PdfObject,
   308  // Input: The indirect object containing an Outlines or Outline item dictionary.
   309  // Parent, Prev are the parent or previous node in the hierarchy.
   310  // The function returns the corresponding tree node and the last node which is used
   311  // for setting the Last pointer of the tree node structures.
   312  func (this *PdfReader) buildOutlineTree(obj PdfObject, parent *PdfOutlineTreeNode, prev *PdfOutlineTreeNode) (*PdfOutlineTreeNode, *PdfOutlineTreeNode, error) {
   313  	container, isInd := obj.(*PdfIndirectObject)
   314  	if !isInd {
   315  		return nil, nil, fmt.Errorf("Outline container not an indirect object %T", obj)
   316  	}
   317  	dict, ok := container.PdfObject.(*PdfObjectDictionary)
   318  	if !ok {
   319  		return nil, nil, errors.New("Not a dictionary object")
   320  	}
   321  	common.Log.Trace("build outline tree: dict: %v (%v) p: %p", dict, container, container)
   322  
   323  	if obj := dict.Get("Title"); obj != nil {
   324  		// Outline item has a title. (required)
   325  		outlineItem, err := this.newPdfOutlineItemFromIndirectObject(container)
   326  		if err != nil {
   327  			return nil, nil, err
   328  		}
   329  		outlineItem.Parent = parent
   330  		outlineItem.Prev = prev
   331  
   332  		if firstObj := dict.Get("First"); firstObj != nil {
   333  			firstObj, err = this.traceToObject(firstObj)
   334  			if err != nil {
   335  				return nil, nil, err
   336  			}
   337  			if _, isNull := firstObj.(*PdfObjectNull); !isNull {
   338  				first, last, err := this.buildOutlineTree(firstObj, &outlineItem.PdfOutlineTreeNode, nil)
   339  				if err != nil {
   340  					return nil, nil, err
   341  				}
   342  				outlineItem.First = first
   343  				outlineItem.Last = last
   344  			}
   345  		}
   346  
   347  		// Resolve the reference to next
   348  		if nextObj := dict.Get("Next"); nextObj != nil {
   349  			nextObj, err = this.traceToObject(nextObj)
   350  			if err != nil {
   351  				return nil, nil, err
   352  			}
   353  			if _, isNull := nextObj.(*PdfObjectNull); !isNull {
   354  				next, last, err := this.buildOutlineTree(nextObj, parent, &outlineItem.PdfOutlineTreeNode)
   355  				if err != nil {
   356  					return nil, nil, err
   357  				}
   358  				outlineItem.Next = next
   359  				return &outlineItem.PdfOutlineTreeNode, last, nil
   360  			}
   361  		}
   362  
   363  		return &outlineItem.PdfOutlineTreeNode, &outlineItem.PdfOutlineTreeNode, nil
   364  	} else {
   365  		// Outline dictionary (structure element).
   366  
   367  		outline, err := newPdfOutlineFromIndirectObject(container)
   368  		if err != nil {
   369  			return nil, nil, err
   370  		}
   371  		outline.Parent = parent
   372  		//outline.Prev = parent
   373  
   374  		if firstObj := dict.Get("First"); firstObj != nil {
   375  			// Has children...
   376  			firstObj, err = this.traceToObject(firstObj)
   377  			if err != nil {
   378  				return nil, nil, err
   379  			}
   380  			if _, isNull := firstObj.(*PdfObjectNull); !isNull {
   381  				first, last, err := this.buildOutlineTree(firstObj, &outline.PdfOutlineTreeNode, nil)
   382  				if err != nil {
   383  					return nil, nil, err
   384  				}
   385  				outline.First = first
   386  				outline.Last = last
   387  			}
   388  		}
   389  
   390  		/*
   391  			if nextObj, hasNext := (*dict)["Next"]; hasNext {
   392  				nextObj, err = this.traceToObject(nextObj)
   393  				if err != nil {
   394  					return nil, nil, err
   395  				}
   396  				if _, isNull := nextObj.(*PdfObjectNull); !isNull {
   397  					next, last, err := this.buildOutlineTree(nextObj, parent, &outline.PdfOutlineTreeNode)
   398  					if err != nil {
   399  						return nil, nil, err
   400  					}
   401  					outline.Next = next
   402  					return &outline.PdfOutlineTreeNode, last, nil
   403  				}
   404  			}*/
   405  
   406  		return &outline.PdfOutlineTreeNode, &outline.PdfOutlineTreeNode, nil
   407  	}
   408  }
   409  
   410  // GetOutlineTree returns the outline tree.
   411  func (this *PdfReader) GetOutlineTree() *PdfOutlineTreeNode {
   412  	return this.outlineTree
   413  }
   414  
   415  // GetOutlinesFlattened returns a flattened list of tree nodes and titles.
   416  func (this *PdfReader) GetOutlinesFlattened() ([]*PdfOutlineTreeNode, []string, error) {
   417  	outlineNodeList := []*PdfOutlineTreeNode{}
   418  	flattenedTitleList := []string{}
   419  
   420  	// Recursive flattening function.
   421  	var flattenFunc func(*PdfOutlineTreeNode, *[]*PdfOutlineTreeNode, *[]string, int)
   422  	flattenFunc = func(node *PdfOutlineTreeNode, outlineList *[]*PdfOutlineTreeNode, titleList *[]string, depth int) {
   423  		if node == nil {
   424  			return
   425  		}
   426  		if node.context == nil {
   427  			common.Log.Debug("ERROR: Missing node.context") // Should not happen ever.
   428  			return
   429  		}
   430  
   431  		if item, isItem := node.context.(*PdfOutlineItem); isItem {
   432  			*outlineList = append(*outlineList, &item.PdfOutlineTreeNode)
   433  			title := strings.Repeat(" ", depth*2) + string(*item.Title)
   434  			*titleList = append(*titleList, title)
   435  			if item.Next != nil {
   436  				flattenFunc(item.Next, outlineList, titleList, depth)
   437  			}
   438  		}
   439  
   440  		if node.First != nil {
   441  			title := strings.Repeat(" ", depth*2) + "+"
   442  			*titleList = append(*titleList, title)
   443  			flattenFunc(node.First, outlineList, titleList, depth+1)
   444  		}
   445  	}
   446  	flattenFunc(this.outlineTree, &outlineNodeList, &flattenedTitleList, 0)
   447  	return outlineNodeList, flattenedTitleList, nil
   448  }
   449  
   450  // loadForms loads the AcroForm.
   451  func (this *PdfReader) loadForms() (*PdfAcroForm, error) {
   452  	if this.parser.GetCrypter() != nil && !this.parser.IsAuthenticated() {
   453  		return nil, fmt.Errorf("File need to be decrypted first")
   454  	}
   455  
   456  	// Has forms?
   457  	catalog := this.catalog
   458  	obj := catalog.Get("AcroForm")
   459  	if obj == nil {
   460  		// Nothing to load.
   461  		return nil, nil
   462  	}
   463  	var err error
   464  	obj, err = this.traceToObject(obj)
   465  	if err != nil {
   466  		return nil, err
   467  	}
   468  	obj = TraceToDirectObject(obj)
   469  	if _, isNull := obj.(*PdfObjectNull); isNull {
   470  		common.Log.Trace("Acroform is a null object (empty)\n")
   471  		return nil, nil
   472  	}
   473  
   474  	formsDict, ok := obj.(*PdfObjectDictionary)
   475  	if !ok {
   476  		common.Log.Debug("Invalid AcroForm entry %T", obj)
   477  		common.Log.Debug("Does not have forms")
   478  		return nil, fmt.Errorf("Invalid acroform entry %T", obj)
   479  	}
   480  	common.Log.Trace("Has Acro forms")
   481  	// Load it.
   482  
   483  	// Ensure we have access to everything.
   484  	common.Log.Trace("Traverse the Acroforms structure")
   485  	err = this.traverseObjectData(formsDict)
   486  	if err != nil {
   487  		common.Log.Debug("ERROR: Unable to traverse AcroForms (%s)", err)
   488  		return nil, err
   489  	}
   490  
   491  	// Create the acro forms object.
   492  	acroForm, err := this.newPdfAcroFormFromDict(formsDict)
   493  	if err != nil {
   494  		return nil, err
   495  	}
   496  
   497  	return acroForm, nil
   498  }
   499  
   500  func (this *PdfReader) lookupPageByObject(obj PdfObject) (*PdfPage, error) {
   501  	// can be indirect, direct, or reference
   502  	// look up the corresponding page
   503  	return nil, errors.New("Page not found")
   504  }
   505  
   506  // Build the table of contents.
   507  // tree, ex: Pages -> Pages -> Pages -> Page
   508  // Traverse through the whole thing recursively.
   509  func (this *PdfReader) buildPageList(node *PdfIndirectObject, parent *PdfIndirectObject, traversedPageNodes map[PdfObject]bool) error {
   510  	if node == nil {
   511  		return nil
   512  	}
   513  
   514  	if _, alreadyTraversed := traversedPageNodes[node]; alreadyTraversed {
   515  		common.Log.Debug("Cyclic recursion, skipping")
   516  		return nil
   517  	}
   518  	traversedPageNodes[node] = true
   519  
   520  	nodeDict, ok := node.PdfObject.(*PdfObjectDictionary)
   521  	if !ok {
   522  		return errors.New("Node not a dictionary")
   523  	}
   524  
   525  	objType, ok := (*nodeDict).Get("Type").(*PdfObjectName)
   526  	if !ok {
   527  		return errors.New("Node missing Type (Required)")
   528  	}
   529  	common.Log.Trace("buildPageList node type: %s", *objType)
   530  	if *objType == "Page" {
   531  		p, err := this.newPdfPageFromDict(nodeDict)
   532  		if err != nil {
   533  			return err
   534  		}
   535  		p.setContainer(node)
   536  
   537  		if parent != nil {
   538  			// Set the parent (in case missing or incorrect).
   539  			nodeDict.Set("Parent", parent)
   540  		}
   541  		this.pageList = append(this.pageList, node)
   542  		this.PageList = append(this.PageList, p)
   543  
   544  		return nil
   545  	}
   546  	if *objType != "Pages" {
   547  		common.Log.Debug("ERROR: Table of content containing non Page/Pages object! (%s)", objType)
   548  		return errors.New("Table of content containing non Page/Pages object!")
   549  	}
   550  
   551  	// A Pages object.  Update the parent.
   552  	if parent != nil {
   553  		nodeDict.Set("Parent", parent)
   554  	}
   555  
   556  	// Resolve the object recursively.
   557  	err := this.traverseObjectData(node)
   558  	if err != nil {
   559  		return err
   560  	}
   561  
   562  	kidsObj, err := this.parser.Trace(nodeDict.Get("Kids"))
   563  	if err != nil {
   564  		common.Log.Debug("ERROR: Failed loading Kids object")
   565  		return err
   566  	}
   567  
   568  	var kids *PdfObjectArray
   569  	kids, ok = kidsObj.(*PdfObjectArray)
   570  	if !ok {
   571  		kidsIndirect, isIndirect := kidsObj.(*PdfIndirectObject)
   572  		if !isIndirect {
   573  			return errors.New("Invalid Kids object")
   574  		}
   575  		kids, ok = kidsIndirect.PdfObject.(*PdfObjectArray)
   576  		if !ok {
   577  			return errors.New("Invalid Kids indirect object")
   578  		}
   579  	}
   580  	common.Log.Trace("Kids: %s", kids)
   581  	for idx, child := range *kids {
   582  		child, ok := child.(*PdfIndirectObject)
   583  		if !ok {
   584  			common.Log.Debug("ERROR: Page not indirect object - (%s)", child)
   585  			return errors.New("Page not indirect object")
   586  		}
   587  		(*kids)[idx] = child
   588  		err = this.buildPageList(child, node, traversedPageNodes)
   589  		if err != nil {
   590  			return err
   591  		}
   592  	}
   593  
   594  	return nil
   595  }
   596  
   597  // GetNumPages returns the number of pages in the document.
   598  func (this *PdfReader) GetNumPages() (int, error) {
   599  	if this.parser.GetCrypter() != nil && !this.parser.IsAuthenticated() {
   600  		return 0, fmt.Errorf("File need to be decrypted first")
   601  	}
   602  	return len(this.pageList), nil
   603  }
   604  
   605  // Resolves a reference, returning the object and indicates whether or not
   606  // it was cached.
   607  func (this *PdfReader) resolveReference(ref *PdfObjectReference) (PdfObject, bool, error) {
   608  	cachedObj, isCached := this.parser.ObjCache[int(ref.ObjectNumber)]
   609  	if !isCached {
   610  		common.Log.Trace("Reader Lookup ref: %s", ref)
   611  		obj, err := this.parser.LookupByReference(*ref)
   612  		if err != nil {
   613  			return nil, false, err
   614  		}
   615  		this.parser.ObjCache[int(ref.ObjectNumber)] = obj
   616  		return obj, false, nil
   617  	}
   618  	return cachedObj, true, nil
   619  }
   620  
   621  /*
   622   * Recursively traverse through the page object data and look up
   623   * references to indirect objects.
   624   *
   625   * GH: Are we fully protected against circular references? (Add tests).
   626   */
   627  func (this *PdfReader) traverseObjectData(o PdfObject) error {
   628  	common.Log.Trace("Traverse object data")
   629  	if _, isTraversed := this.traversed[o]; isTraversed {
   630  		common.Log.Trace("-Already traversed...")
   631  		return nil
   632  	}
   633  	this.traversed[o] = true
   634  
   635  	if io, isIndirectObj := o.(*PdfIndirectObject); isIndirectObj {
   636  		common.Log.Trace("io: %s", io)
   637  		common.Log.Trace("- %s", io.PdfObject)
   638  		err := this.traverseObjectData(io.PdfObject)
   639  		return err
   640  	}
   641  
   642  	if so, isStreamObj := o.(*PdfObjectStream); isStreamObj {
   643  		err := this.traverseObjectData(so.PdfObjectDictionary)
   644  		return err
   645  	}
   646  
   647  	if dict, isDict := o.(*PdfObjectDictionary); isDict {
   648  		common.Log.Trace("- dict: %s", dict)
   649  		for _, name := range dict.Keys() {
   650  			v := dict.Get(name)
   651  			if ref, isRef := v.(*PdfObjectReference); isRef {
   652  				resolvedObj, _, err := this.resolveReference(ref)
   653  				if err != nil {
   654  					return err
   655  				}
   656  				dict.Set(name, resolvedObj)
   657  				err = this.traverseObjectData(resolvedObj)
   658  				if err != nil {
   659  					return err
   660  				}
   661  			} else {
   662  				err := this.traverseObjectData(v)
   663  				if err != nil {
   664  					return err
   665  				}
   666  			}
   667  		}
   668  		return nil
   669  	}
   670  
   671  	if arr, isArray := o.(*PdfObjectArray); isArray {
   672  		common.Log.Trace("- array: %s", arr)
   673  		for idx, v := range *arr {
   674  			if ref, isRef := v.(*PdfObjectReference); isRef {
   675  				resolvedObj, _, err := this.resolveReference(ref)
   676  				if err != nil {
   677  					return err
   678  				}
   679  				(*arr)[idx] = resolvedObj
   680  
   681  				err = this.traverseObjectData(resolvedObj)
   682  				if err != nil {
   683  					return err
   684  				}
   685  			} else {
   686  				err := this.traverseObjectData(v)
   687  				if err != nil {
   688  					return err
   689  				}
   690  			}
   691  		}
   692  		return nil
   693  	}
   694  
   695  	if _, isRef := o.(*PdfObjectReference); isRef {
   696  		common.Log.Debug("ERROR: Reader tracing a reference!")
   697  		return errors.New("Reader tracing a reference!")
   698  	}
   699  
   700  	return nil
   701  }
   702  
   703  // GetPageAsIndirectObject returns an indirect object containing the page dictionary for a specified page number.
   704  func (this *PdfReader) GetPageAsIndirectObject(pageNumber int) (PdfObject, error) {
   705  	if this.parser.GetCrypter() != nil && !this.parser.IsAuthenticated() {
   706  		return nil, fmt.Errorf("File needs to be decrypted first")
   707  	}
   708  	if len(this.pageList) < pageNumber {
   709  		return nil, errors.New("Invalid page number (page count too short)")
   710  	}
   711  	page := this.pageList[pageNumber-1]
   712  
   713  	// Look up all references related to page and load everything.
   714  	err := this.traverseObjectData(page)
   715  	if err != nil {
   716  		return nil, err
   717  	}
   718  	common.Log.Trace("Page: %T %s", page, page)
   719  	common.Log.Trace("- %T %s", page.PdfObject, page.PdfObject)
   720  
   721  	return page, nil
   722  }
   723  
   724  // GetPage returns the PdfPage model for the specified page number.
   725  func (this *PdfReader) GetPage(pageNumber int) (*PdfPage, error) {
   726  	if this.parser.GetCrypter() != nil && !this.parser.IsAuthenticated() {
   727  		return nil, fmt.Errorf("File needs to be decrypted first")
   728  	}
   729  	if len(this.pageList) < pageNumber {
   730  		return nil, errors.New("Invalid page number (page count too short)")
   731  	}
   732  	idx := pageNumber - 1
   733  	if idx < 0 {
   734  		return nil, fmt.Errorf("Page numbering must start at 1")
   735  	}
   736  	page := this.PageList[idx]
   737  
   738  	return page, nil
   739  }
   740  
   741  // GetOCProperties returns the optional content properties PdfObject.
   742  func (this *PdfReader) GetOCProperties() (PdfObject, error) {
   743  	dict := this.catalog
   744  	obj := dict.Get("OCProperties")
   745  	var err error
   746  	obj, err = this.traceToObject(obj)
   747  	if err != nil {
   748  		return nil, err
   749  	}
   750  
   751  	// Resolve all references...
   752  	// Should be pretty safe. Should not be referencing to pages or
   753  	// any large structures.  Local structures and references
   754  	// to OC Groups.
   755  	err = this.traverseObjectData(obj)
   756  	if err != nil {
   757  		return nil, err
   758  	}
   759  
   760  	return obj, nil
   761  }
   762  
   763  // Inspect inspects the object types, subtypes and content in the PDF file returning a map of
   764  // object type to number of instances of each.
   765  func (this *PdfReader) Inspect() (map[string]int, error) {
   766  	return this.parser.Inspect()
   767  }
   768  
   769  // GetObjectNums returns the object numbers of the PDF objects in the file
   770  // Numbered objects are either indirect objects or stream objects.
   771  // e.g. objNums := pdfReader.GetObjectNums()
   772  // The underlying objects can then be accessed with
   773  // pdfReader.GetIndirectObjectByNumber(objNums[0]) for the first available object.
   774  func (r *PdfReader) GetObjectNums() []int {
   775  	return r.parser.GetObjectNums()
   776  }
   777  
   778  // GetIndirectObjectByNumber retrieves and returns a specific PdfObject by object number.
   779  func (this *PdfReader) GetIndirectObjectByNumber(number int) (PdfObject, error) {
   780  	obj, err := this.parser.LookupByNumber(number)
   781  	return obj, err
   782  }
   783  
   784  // GetTrailer returns the PDF's trailer dictionary.
   785  func (this *PdfReader) GetTrailer() (*PdfObjectDictionary, error) {
   786  	trailerDict := this.parser.GetTrailer()
   787  	if trailerDict == nil {
   788  		return nil, errors.New("Trailer missing")
   789  	}
   790  
   791  	return trailerDict, nil
   792  }