github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/core/crossrefs.go (about) 1 /* 2 * This file is subject to the terms and conditions defined in 3 * file 'LICENSE.md', which is part of this source code package. 4 */ 5 6 package core 7 8 import ( 9 "bufio" 10 "bytes" 11 "errors" 12 "os" 13 "strings" 14 15 "github.com/unidoc/unidoc/common" 16 ) 17 18 // TODO (v3): Create a new type xrefType which can be an integer and can be used for improved type checking. 19 // TODO (v3): Unexport these constants and rename with camelCase. 20 const ( 21 // XREF_TABLE_ENTRY indicates a normal xref table entry. 22 XREF_TABLE_ENTRY = iota 23 24 // XREF_OBJECT_STREAM indicates an xref entry in an xref object stream. 25 XREF_OBJECT_STREAM = iota 26 ) 27 28 // XrefObject defines a cross reference entry which is a map between object number (with generation number) and the 29 // location of the actual object, either as a file offset (xref table entry), or as a location within an xref 30 // stream object (xref object stream). 31 // TODO (v3): Unexport. 32 type XrefObject struct { 33 xtype int 34 objectNumber int 35 generation int 36 // For normal xrefs (defined by OFFSET) 37 offset int64 38 // For xrefs to object streams. 39 osObjNumber int 40 osObjIndex int 41 } 42 43 // XrefTable is a map between object number and corresponding XrefObject. 44 // TODO (v3): Unexport. 45 // TODO: Consider changing to a slice, so can maintain the object order without sorting when analyzing. 46 type XrefTable map[int]XrefObject 47 48 // ObjectStream represents an object stream's information which can contain multiple indirect objects. 49 // The information specifies the number of objects and has information about offset locations for 50 // each object. 51 // TODO (v3): Unexport. 52 type ObjectStream struct { 53 N int // TODO (v3): Unexport. 54 ds []byte 55 offsets map[int]int64 56 } 57 58 // ObjectStreams defines a map between object numbers (object streams only) and underlying ObjectStream information. 59 type ObjectStreams map[int]ObjectStream 60 61 // ObjectCache defines a map between object numbers and corresponding PdfObject. Serves as a cache for PdfObjects that 62 // have already been parsed. 63 // TODO (v3): Unexport. 64 type ObjectCache map[int]PdfObject 65 66 // Get an object from an object stream. 67 func (parser *PdfParser) lookupObjectViaOS(sobjNumber int, objNum int) (PdfObject, error) { 68 var bufReader *bytes.Reader 69 var objstm ObjectStream 70 var cached bool 71 72 objstm, cached = parser.objstms[sobjNumber] 73 if !cached { 74 soi, err := parser.LookupByNumber(sobjNumber) 75 if err != nil { 76 common.Log.Debug("Missing object stream with number %d", sobjNumber) 77 return nil, err 78 } 79 80 so, ok := soi.(*PdfObjectStream) 81 if !ok { 82 return nil, errors.New("Invalid object stream") 83 } 84 85 if parser.crypter != nil && !parser.crypter.isDecrypted(so) { 86 return nil, errors.New("Need to decrypt the stream") 87 } 88 89 sod := so.PdfObjectDictionary 90 common.Log.Trace("so d: %s\n", *sod) 91 name, ok := sod.Get("Type").(*PdfObjectName) 92 if !ok { 93 common.Log.Debug("ERROR: Object stream should always have a Type") 94 return nil, errors.New("Object stream missing Type") 95 } 96 if strings.ToLower(string(*name)) != "objstm" { 97 common.Log.Debug("ERROR: Object stream type shall always be ObjStm !") 98 return nil, errors.New("Object stream type != ObjStm") 99 } 100 101 N, ok := sod.Get("N").(*PdfObjectInteger) 102 if !ok { 103 return nil, errors.New("Invalid N in stream dictionary") 104 } 105 firstOffset, ok := sod.Get("First").(*PdfObjectInteger) 106 if !ok { 107 return nil, errors.New("Invalid First in stream dictionary") 108 } 109 110 common.Log.Trace("type: %s number of objects: %d", name, *N) 111 ds, err := DecodeStream(so) 112 if err != nil { 113 return nil, err 114 } 115 116 common.Log.Trace("Decoded: %s", ds) 117 118 // Temporarily change the reader object to this decoded buffer. 119 // Change back afterwards. 120 bakOffset := parser.GetFileOffset() 121 defer func() { parser.SetFileOffset(bakOffset) }() 122 123 bufReader = bytes.NewReader(ds) 124 parser.reader = bufio.NewReader(bufReader) 125 126 common.Log.Trace("Parsing offset map") 127 // Load the offset map (relative to the beginning of the stream...) 128 offsets := map[int]int64{} 129 // Object list and offsets. 130 for i := 0; i < int(*N); i++ { 131 parser.skipSpaces() 132 // Object number. 133 obj, err := parser.parseNumber() 134 if err != nil { 135 return nil, err 136 } 137 onum, ok := obj.(*PdfObjectInteger) 138 if !ok { 139 return nil, errors.New("Invalid object stream offset table") 140 } 141 142 parser.skipSpaces() 143 // Offset. 144 obj, err = parser.parseNumber() 145 if err != nil { 146 return nil, err 147 } 148 offset, ok := obj.(*PdfObjectInteger) 149 if !ok { 150 return nil, errors.New("Invalid object stream offset table") 151 } 152 153 common.Log.Trace("obj %d offset %d", *onum, *offset) 154 offsets[int(*onum)] = int64(*firstOffset + *offset) 155 } 156 157 objstm = ObjectStream{N: int(*N), ds: ds, offsets: offsets} 158 parser.objstms[sobjNumber] = objstm 159 } else { 160 // Temporarily change the reader object to this decoded buffer. 161 // Point back afterwards. 162 bakOffset := parser.GetFileOffset() 163 defer func() { parser.SetFileOffset(bakOffset) }() 164 165 bufReader = bytes.NewReader(objstm.ds) 166 // Temporarily change the reader object to this decoded buffer. 167 parser.reader = bufio.NewReader(bufReader) 168 } 169 170 offset := objstm.offsets[objNum] 171 common.Log.Trace("ACTUAL offset[%d] = %d", objNum, offset) 172 173 bufReader.Seek(offset, os.SEEK_SET) 174 parser.reader = bufio.NewReader(bufReader) 175 176 bb, _ := parser.reader.Peek(100) 177 common.Log.Trace("OBJ peek \"%s\"", string(bb)) 178 179 val, err := parser.parseObject() 180 if err != nil { 181 common.Log.Debug("ERROR Fail to read object (%s)", err) 182 return nil, err 183 } 184 if val == nil { 185 return nil, errors.New("Object cannot be null") 186 } 187 188 // Make an indirect object around it. 189 io := PdfIndirectObject{} 190 io.ObjectNumber = int64(objNum) 191 io.PdfObject = val 192 193 return &io, nil 194 } 195 196 // LookupByNumber looks up a PdfObject by object number. Returns an error on failure. 197 // TODO (v3): Unexport. 198 func (parser *PdfParser) LookupByNumber(objNumber int) (PdfObject, error) { 199 // Outside interface for lookupByNumberWrapper. Default attempts repairs of bad xref tables. 200 obj, _, err := parser.lookupByNumberWrapper(objNumber, true) 201 return obj, err 202 } 203 204 // Wrapper for lookupByNumber, checks if object encrypted etc. 205 func (parser *PdfParser) lookupByNumberWrapper(objNumber int, attemptRepairs bool) (PdfObject, bool, error) { 206 obj, inObjStream, err := parser.lookupByNumber(objNumber, attemptRepairs) 207 if err != nil { 208 return nil, inObjStream, err 209 } 210 211 // If encrypted, decrypt it prior to returning. 212 // Do not attempt to decrypt objects within object streams. 213 if !inObjStream && parser.crypter != nil && !parser.crypter.isDecrypted(obj) { 214 err := parser.crypter.Decrypt(obj, 0, 0) 215 if err != nil { 216 return nil, inObjStream, err 217 } 218 } 219 220 return obj, inObjStream, nil 221 } 222 223 func getObjectNumber(obj PdfObject) (int64, int64, error) { 224 if io, isIndirect := obj.(*PdfIndirectObject); isIndirect { 225 return io.ObjectNumber, io.GenerationNumber, nil 226 } 227 if so, isStream := obj.(*PdfObjectStream); isStream { 228 return so.ObjectNumber, so.GenerationNumber, nil 229 } 230 return 0, 0, errors.New("Not an indirect/stream object") 231 } 232 233 // LookupByNumber 234 // Repair signals whether to repair if broken. 235 func (parser *PdfParser) lookupByNumber(objNumber int, attemptRepairs bool) (PdfObject, bool, error) { 236 obj, ok := parser.ObjCache[objNumber] 237 if ok { 238 common.Log.Trace("Returning cached object %d", objNumber) 239 return obj, false, nil 240 } 241 242 xref, ok := parser.xrefs[objNumber] 243 if !ok { 244 // An indirect reference to an undefined object shall not be 245 // considered an error by a conforming reader; it shall be 246 // treated as a reference to the null object. 247 common.Log.Trace("Unable to locate object in xrefs! - Returning null object") 248 var nullObj PdfObjectNull 249 return &nullObj, false, nil 250 } 251 252 common.Log.Trace("Lookup obj number %d", objNumber) 253 if xref.xtype == XREF_TABLE_ENTRY { 254 common.Log.Trace("xrefobj obj num %d", xref.objectNumber) 255 common.Log.Trace("xrefobj gen %d", xref.generation) 256 common.Log.Trace("xrefobj offset %d", xref.offset) 257 258 parser.rs.Seek(xref.offset, os.SEEK_SET) 259 parser.reader = bufio.NewReader(parser.rs) 260 261 obj, err := parser.ParseIndirectObject() 262 if err != nil { 263 common.Log.Debug("ERROR Failed reading xref (%s)", err) 264 // Offset pointing to a non-object. Try to repair the file. 265 if attemptRepairs { 266 common.Log.Debug("Attempting to repair xrefs (top down)") 267 xrefTable, err := parser.repairRebuildXrefsTopDown() 268 if err != nil { 269 common.Log.Debug("ERROR Failed repair (%s)", err) 270 return nil, false, err 271 } 272 parser.xrefs = *xrefTable 273 return parser.lookupByNumber(objNumber, false) 274 } 275 return nil, false, err 276 } 277 278 if attemptRepairs { 279 // Check the object number.. 280 // If it does not match, then try to rebuild, i.e. loop through 281 // all the items in the xref and look each one up and correct. 282 realObjNum, _, _ := getObjectNumber(obj) 283 if int(realObjNum) != objNumber { 284 common.Log.Debug("Invalid xrefs: Rebuilding") 285 err := parser.rebuildXrefTable() 286 if err != nil { 287 return nil, false, err 288 } 289 // Empty the cache. 290 parser.ObjCache = ObjectCache{} 291 // Try looking up again and return. 292 return parser.lookupByNumberWrapper(objNumber, false) 293 } 294 } 295 296 common.Log.Trace("Returning obj") 297 parser.ObjCache[objNumber] = obj 298 return obj, false, nil 299 } else if xref.xtype == XREF_OBJECT_STREAM { 300 common.Log.Trace("xref from object stream!") 301 common.Log.Trace(">Load via OS!") 302 common.Log.Trace("Object stream available in object %d/%d", xref.osObjNumber, xref.osObjIndex) 303 304 if xref.osObjNumber == objNumber { 305 common.Log.Debug("ERROR Circular reference!?!") 306 return nil, true, errors.New("Xref circular reference") 307 } 308 _, exists := parser.xrefs[xref.osObjNumber] 309 if exists { 310 optr, err := parser.lookupObjectViaOS(xref.osObjNumber, objNumber) //xref.osObjIndex) 311 if err != nil { 312 common.Log.Debug("ERROR Returning ERR (%s)", err) 313 return nil, true, err 314 } 315 common.Log.Trace("<Loaded via OS") 316 parser.ObjCache[objNumber] = optr 317 if parser.crypter != nil { 318 // Mark as decrypted (inside object stream) for caching. 319 // and avoid decrypting decrypted object. 320 parser.crypter.DecryptedObjects[optr] = true 321 } 322 return optr, true, nil 323 } else { 324 common.Log.Debug("?? Belongs to a non-cross referenced object ...!") 325 return nil, true, errors.New("OS belongs to a non cross referenced object") 326 } 327 } 328 return nil, false, errors.New("Unknown xref type") 329 } 330 331 // LookupByReference looks up a PdfObject by a reference. 332 func (parser *PdfParser) LookupByReference(ref PdfObjectReference) (PdfObject, error) { 333 common.Log.Trace("Looking up reference %s", ref.String()) 334 return parser.LookupByNumber(int(ref.ObjectNumber)) 335 } 336 337 // Trace traces a PdfObject to direct object, looking up and resolving references as needed (unlike TraceToDirect). 338 // TODO (v3): Unexport. 339 func (parser *PdfParser) Trace(obj PdfObject) (PdfObject, error) { 340 ref, isRef := obj.(*PdfObjectReference) 341 if !isRef { 342 // Direct object already. 343 return obj, nil 344 } 345 346 bakOffset := parser.GetFileOffset() 347 defer func() { parser.SetFileOffset(bakOffset) }() 348 349 o, err := parser.LookupByReference(*ref) 350 if err != nil { 351 return nil, err 352 } 353 354 io, isInd := o.(*PdfIndirectObject) 355 if !isInd { 356 // Not indirect (Stream or null object). 357 return o, nil 358 } 359 o = io.PdfObject 360 _, isRef = o.(*PdfObjectReference) 361 if isRef { 362 return io, errors.New("Multi depth trace pointer to pointer") 363 } 364 365 return o, nil 366 } 367 368 func printXrefTable(xrefTable XrefTable) { 369 common.Log.Debug("=X=X=X=") 370 common.Log.Debug("Xref table:") 371 i := 0 372 for _, xref := range xrefTable { 373 common.Log.Debug("i+1: %d (obj num: %d gen: %d) -> %d", i+1, xref.objectNumber, xref.generation, xref.offset) 374 i++ 375 } 376 }