github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/core/crossrefs.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  package core
     7  
     8  import (
     9  	"bufio"
    10  	"bytes"
    11  	"errors"
    12  	"os"
    13  	"strings"
    14  
    15  	"github.com/unidoc/unidoc/common"
    16  )
    17  
    18  // TODO (v3): Create a new type xrefType which can be an integer and can be used for improved type checking.
    19  // TODO (v3): Unexport these constants and rename with camelCase.
    20  const (
    21  	// XREF_TABLE_ENTRY indicates a normal xref table entry.
    22  	XREF_TABLE_ENTRY = iota
    23  
    24  	// XREF_OBJECT_STREAM indicates an xref entry in an xref object stream.
    25  	XREF_OBJECT_STREAM = iota
    26  )
    27  
    28  // XrefObject defines a cross reference entry which is a map between object number (with generation number) and the
    29  // location of the actual object, either as a file offset (xref table entry), or as a location within an xref
    30  // stream object (xref object stream).
    31  // TODO (v3): Unexport.
    32  type XrefObject struct {
    33  	xtype        int
    34  	objectNumber int
    35  	generation   int
    36  	// For normal xrefs (defined by OFFSET)
    37  	offset int64
    38  	// For xrefs to object streams.
    39  	osObjNumber int
    40  	osObjIndex  int
    41  }
    42  
// XrefTable is a map between object number and corresponding XrefObject.
// The key is the object number; the value describes where the object with that number is stored.
// Being a map, iteration order over the entries is not defined.
// TODO (v3): Unexport.
// TODO: Consider changing to a slice, so can maintain the object order without sorting when analyzing.
type XrefTable map[int]XrefObject
    47  
// ObjectStream represents an object stream's information which can contain multiple indirect objects.
// The information specifies the number of objects and has information about offset locations for
// each object.
// TODO (v3): Unexport.
type ObjectStream struct {
	// N is the number of objects, taken from the "N" entry of the stream dictionary.
	N       int // TODO (v3): Unexport.
	// ds is the decoded stream data.
	ds      []byte
	// offsets maps the object number of each contained object to its offset within ds
	// (the value of the "First" entry is already added in).
	offsets map[int]int64
}
    57  
// ObjectStreams defines a map between object numbers (object streams only) and underlying ObjectStream information.
// An entry carries the decoded data and the offset table of one object stream, keyed by the object number of
// that object stream.
type ObjectStreams map[int]ObjectStream
    60  
// ObjectCache defines a map between object numbers and corresponding PdfObject. Serves as a cache for PdfObjects that
// have already been parsed.
// lookupByNumber checks the cache before consulting the xref table and stores every object it loads successfully;
// the cache is replaced by an empty one when the xref table is rebuilt.
// TODO (v3): Unexport.
type ObjectCache map[int]PdfObject
    65  
    66  // Get an object from an object stream.
    67  func (parser *PdfParser) lookupObjectViaOS(sobjNumber int, objNum int) (PdfObject, error) {
    68  	var bufReader *bytes.Reader
    69  	var objstm ObjectStream
    70  	var cached bool
    71  
    72  	objstm, cached = parser.objstms[sobjNumber]
    73  	if !cached {
    74  		soi, err := parser.LookupByNumber(sobjNumber)
    75  		if err != nil {
    76  			common.Log.Debug("Missing object stream with number %d", sobjNumber)
    77  			return nil, err
    78  		}
    79  
    80  		so, ok := soi.(*PdfObjectStream)
    81  		if !ok {
    82  			return nil, errors.New("Invalid object stream")
    83  		}
    84  
    85  		if parser.crypter != nil && !parser.crypter.isDecrypted(so) {
    86  			return nil, errors.New("Need to decrypt the stream")
    87  		}
    88  
    89  		sod := so.PdfObjectDictionary
    90  		common.Log.Trace("so d: %s\n", *sod)
    91  		name, ok := sod.Get("Type").(*PdfObjectName)
    92  		if !ok {
    93  			common.Log.Debug("ERROR: Object stream should always have a Type")
    94  			return nil, errors.New("Object stream missing Type")
    95  		}
    96  		if strings.ToLower(string(*name)) != "objstm" {
    97  			common.Log.Debug("ERROR: Object stream type shall always be ObjStm !")
    98  			return nil, errors.New("Object stream type != ObjStm")
    99  		}
   100  
   101  		N, ok := sod.Get("N").(*PdfObjectInteger)
   102  		if !ok {
   103  			return nil, errors.New("Invalid N in stream dictionary")
   104  		}
   105  		firstOffset, ok := sod.Get("First").(*PdfObjectInteger)
   106  		if !ok {
   107  			return nil, errors.New("Invalid First in stream dictionary")
   108  		}
   109  
   110  		common.Log.Trace("type: %s number of objects: %d", name, *N)
   111  		ds, err := DecodeStream(so)
   112  		if err != nil {
   113  			return nil, err
   114  		}
   115  
   116  		common.Log.Trace("Decoded: %s", ds)
   117  
   118  		// Temporarily change the reader object to this decoded buffer.
   119  		// Change back afterwards.
   120  		bakOffset := parser.GetFileOffset()
   121  		defer func() { parser.SetFileOffset(bakOffset) }()
   122  
   123  		bufReader = bytes.NewReader(ds)
   124  		parser.reader = bufio.NewReader(bufReader)
   125  
   126  		common.Log.Trace("Parsing offset map")
   127  		// Load the offset map (relative to the beginning of the stream...)
   128  		offsets := map[int]int64{}
   129  		// Object list and offsets.
   130  		for i := 0; i < int(*N); i++ {
   131  			parser.skipSpaces()
   132  			// Object number.
   133  			obj, err := parser.parseNumber()
   134  			if err != nil {
   135  				return nil, err
   136  			}
   137  			onum, ok := obj.(*PdfObjectInteger)
   138  			if !ok {
   139  				return nil, errors.New("Invalid object stream offset table")
   140  			}
   141  
   142  			parser.skipSpaces()
   143  			// Offset.
   144  			obj, err = parser.parseNumber()
   145  			if err != nil {
   146  				return nil, err
   147  			}
   148  			offset, ok := obj.(*PdfObjectInteger)
   149  			if !ok {
   150  				return nil, errors.New("Invalid object stream offset table")
   151  			}
   152  
   153  			common.Log.Trace("obj %d offset %d", *onum, *offset)
   154  			offsets[int(*onum)] = int64(*firstOffset + *offset)
   155  		}
   156  
   157  		objstm = ObjectStream{N: int(*N), ds: ds, offsets: offsets}
   158  		parser.objstms[sobjNumber] = objstm
   159  	} else {
   160  		// Temporarily change the reader object to this decoded buffer.
   161  		// Point back afterwards.
   162  		bakOffset := parser.GetFileOffset()
   163  		defer func() { parser.SetFileOffset(bakOffset) }()
   164  
   165  		bufReader = bytes.NewReader(objstm.ds)
   166  		// Temporarily change the reader object to this decoded buffer.
   167  		parser.reader = bufio.NewReader(bufReader)
   168  	}
   169  
   170  	offset := objstm.offsets[objNum]
   171  	common.Log.Trace("ACTUAL offset[%d] = %d", objNum, offset)
   172  
   173  	bufReader.Seek(offset, os.SEEK_SET)
   174  	parser.reader = bufio.NewReader(bufReader)
   175  
   176  	bb, _ := parser.reader.Peek(100)
   177  	common.Log.Trace("OBJ peek \"%s\"", string(bb))
   178  
   179  	val, err := parser.parseObject()
   180  	if err != nil {
   181  		common.Log.Debug("ERROR Fail to read object (%s)", err)
   182  		return nil, err
   183  	}
   184  	if val == nil {
   185  		return nil, errors.New("Object cannot be null")
   186  	}
   187  
   188  	// Make an indirect object around it.
   189  	io := PdfIndirectObject{}
   190  	io.ObjectNumber = int64(objNum)
   191  	io.PdfObject = val
   192  
   193  	return &io, nil
   194  }
   195  
   196  // LookupByNumber looks up a PdfObject by object number.  Returns an error on failure.
   197  // TODO (v3): Unexport.
   198  func (parser *PdfParser) LookupByNumber(objNumber int) (PdfObject, error) {
   199  	// Outside interface for lookupByNumberWrapper.  Default attempts repairs of bad xref tables.
   200  	obj, _, err := parser.lookupByNumberWrapper(objNumber, true)
   201  	return obj, err
   202  }
   203  
   204  // Wrapper for lookupByNumber, checks if object encrypted etc.
   205  func (parser *PdfParser) lookupByNumberWrapper(objNumber int, attemptRepairs bool) (PdfObject, bool, error) {
   206  	obj, inObjStream, err := parser.lookupByNumber(objNumber, attemptRepairs)
   207  	if err != nil {
   208  		return nil, inObjStream, err
   209  	}
   210  
   211  	// If encrypted, decrypt it prior to returning.
   212  	// Do not attempt to decrypt objects within object streams.
   213  	if !inObjStream && parser.crypter != nil && !parser.crypter.isDecrypted(obj) {
   214  		err := parser.crypter.Decrypt(obj, 0, 0)
   215  		if err != nil {
   216  			return nil, inObjStream, err
   217  		}
   218  	}
   219  
   220  	return obj, inObjStream, nil
   221  }
   222  
   223  func getObjectNumber(obj PdfObject) (int64, int64, error) {
   224  	if io, isIndirect := obj.(*PdfIndirectObject); isIndirect {
   225  		return io.ObjectNumber, io.GenerationNumber, nil
   226  	}
   227  	if so, isStream := obj.(*PdfObjectStream); isStream {
   228  		return so.ObjectNumber, so.GenerationNumber, nil
   229  	}
   230  	return 0, 0, errors.New("Not an indirect/stream object")
   231  }
   232  
   233  // LookupByNumber
   234  // Repair signals whether to repair if broken.
   235  func (parser *PdfParser) lookupByNumber(objNumber int, attemptRepairs bool) (PdfObject, bool, error) {
   236  	obj, ok := parser.ObjCache[objNumber]
   237  	if ok {
   238  		common.Log.Trace("Returning cached object %d", objNumber)
   239  		return obj, false, nil
   240  	}
   241  
   242  	xref, ok := parser.xrefs[objNumber]
   243  	if !ok {
   244  		// An indirect reference to an undefined object shall not be
   245  		// considered an error by a conforming reader; it shall be
   246  		// treated as a reference to the null object.
   247  		common.Log.Trace("Unable to locate object in xrefs! - Returning null object")
   248  		var nullObj PdfObjectNull
   249  		return &nullObj, false, nil
   250  	}
   251  
   252  	common.Log.Trace("Lookup obj number %d", objNumber)
   253  	if xref.xtype == XREF_TABLE_ENTRY {
   254  		common.Log.Trace("xrefobj obj num %d", xref.objectNumber)
   255  		common.Log.Trace("xrefobj gen %d", xref.generation)
   256  		common.Log.Trace("xrefobj offset %d", xref.offset)
   257  
   258  		parser.rs.Seek(xref.offset, os.SEEK_SET)
   259  		parser.reader = bufio.NewReader(parser.rs)
   260  
   261  		obj, err := parser.ParseIndirectObject()
   262  		if err != nil {
   263  			common.Log.Debug("ERROR Failed reading xref (%s)", err)
   264  			// Offset pointing to a non-object.  Try to repair the file.
   265  			if attemptRepairs {
   266  				common.Log.Debug("Attempting to repair xrefs (top down)")
   267  				xrefTable, err := parser.repairRebuildXrefsTopDown()
   268  				if err != nil {
   269  					common.Log.Debug("ERROR Failed repair (%s)", err)
   270  					return nil, false, err
   271  				}
   272  				parser.xrefs = *xrefTable
   273  				return parser.lookupByNumber(objNumber, false)
   274  			}
   275  			return nil, false, err
   276  		}
   277  
   278  		if attemptRepairs {
   279  			// Check the object number..
   280  			// If it does not match, then try to rebuild, i.e. loop through
   281  			// all the items in the xref and look each one up and correct.
   282  			realObjNum, _, _ := getObjectNumber(obj)
   283  			if int(realObjNum) != objNumber {
   284  				common.Log.Debug("Invalid xrefs: Rebuilding")
   285  				err := parser.rebuildXrefTable()
   286  				if err != nil {
   287  					return nil, false, err
   288  				}
   289  				// Empty the cache.
   290  				parser.ObjCache = ObjectCache{}
   291  				// Try looking up again and return.
   292  				return parser.lookupByNumberWrapper(objNumber, false)
   293  			}
   294  		}
   295  
   296  		common.Log.Trace("Returning obj")
   297  		parser.ObjCache[objNumber] = obj
   298  		return obj, false, nil
   299  	} else if xref.xtype == XREF_OBJECT_STREAM {
   300  		common.Log.Trace("xref from object stream!")
   301  		common.Log.Trace(">Load via OS!")
   302  		common.Log.Trace("Object stream available in object %d/%d", xref.osObjNumber, xref.osObjIndex)
   303  
   304  		if xref.osObjNumber == objNumber {
   305  			common.Log.Debug("ERROR Circular reference!?!")
   306  			return nil, true, errors.New("Xref circular reference")
   307  		}
   308  		_, exists := parser.xrefs[xref.osObjNumber]
   309  		if exists {
   310  			optr, err := parser.lookupObjectViaOS(xref.osObjNumber, objNumber) //xref.osObjIndex)
   311  			if err != nil {
   312  				common.Log.Debug("ERROR Returning ERR (%s)", err)
   313  				return nil, true, err
   314  			}
   315  			common.Log.Trace("<Loaded via OS")
   316  			parser.ObjCache[objNumber] = optr
   317  			if parser.crypter != nil {
   318  				// Mark as decrypted (inside object stream) for caching.
   319  				// and avoid decrypting decrypted object.
   320  				parser.crypter.DecryptedObjects[optr] = true
   321  			}
   322  			return optr, true, nil
   323  		} else {
   324  			common.Log.Debug("?? Belongs to a non-cross referenced object ...!")
   325  			return nil, true, errors.New("OS belongs to a non cross referenced object")
   326  		}
   327  	}
   328  	return nil, false, errors.New("Unknown xref type")
   329  }
   330  
   331  // LookupByReference looks up a PdfObject by a reference.
   332  func (parser *PdfParser) LookupByReference(ref PdfObjectReference) (PdfObject, error) {
   333  	common.Log.Trace("Looking up reference %s", ref.String())
   334  	return parser.LookupByNumber(int(ref.ObjectNumber))
   335  }
   336  
   337  // Trace traces a PdfObject to direct object, looking up and resolving references as needed (unlike TraceToDirect).
   338  // TODO (v3): Unexport.
   339  func (parser *PdfParser) Trace(obj PdfObject) (PdfObject, error) {
   340  	ref, isRef := obj.(*PdfObjectReference)
   341  	if !isRef {
   342  		// Direct object already.
   343  		return obj, nil
   344  	}
   345  
   346  	bakOffset := parser.GetFileOffset()
   347  	defer func() { parser.SetFileOffset(bakOffset) }()
   348  
   349  	o, err := parser.LookupByReference(*ref)
   350  	if err != nil {
   351  		return nil, err
   352  	}
   353  
   354  	io, isInd := o.(*PdfIndirectObject)
   355  	if !isInd {
   356  		// Not indirect (Stream or null object).
   357  		return o, nil
   358  	}
   359  	o = io.PdfObject
   360  	_, isRef = o.(*PdfObjectReference)
   361  	if isRef {
   362  		return io, errors.New("Multi depth trace pointer to pointer")
   363  	}
   364  
   365  	return o, nil
   366  }
   367  
   368  func printXrefTable(xrefTable XrefTable) {
   369  	common.Log.Debug("=X=X=X=")
   370  	common.Log.Debug("Xref table:")
   371  	i := 0
   372  	for _, xref := range xrefTable {
   373  		common.Log.Debug("i+1: %d (obj num: %d gen: %d) -> %d", i+1, xref.objectNumber, xref.generation, xref.offset)
   374  		i++
   375  	}
   376  }