github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/core/repairs.go (about) 1 /* 2 * This file is subject to the terms and conditions defined in 3 * file 'LICENSE.md', which is part of this source code package. 4 */ 5 6 // Routines related to repairing malformed pdf files. 7 8 package core 9 10 import ( 11 "errors" 12 "fmt" 13 "os" 14 "regexp" 15 16 "bufio" 17 "io" 18 "strconv" 19 20 "github.com/unidoc/unidoc/common" 21 ) 22 23 var repairReXrefTable = regexp.MustCompile(`[\r\n]\s*(xref)\s*[\r\n]`) 24 25 // Locates a standard Xref table by looking for the "xref" entry. 26 // Xref object stream not supported. 27 func (parser *PdfParser) repairLocateXref() (int64, error) { 28 readBuf := int64(1000) 29 parser.rs.Seek(-readBuf, os.SEEK_CUR) 30 31 curOffset, err := parser.rs.Seek(0, os.SEEK_CUR) 32 if err != nil { 33 return 0, err 34 } 35 b2 := make([]byte, readBuf) 36 parser.rs.Read(b2) 37 38 results := repairReXrefTable.FindAllStringIndex(string(b2), -1) 39 if len(results) < 1 { 40 common.Log.Debug("ERROR: Repair: xref not found!") 41 return 0, errors.New("Repair: xref not found") 42 } 43 44 localOffset := int64(results[len(results)-1][0]) 45 xrefOffset := curOffset + localOffset 46 return xrefOffset, nil 47 } 48 49 // Renumbers the xref table. 50 // Useful when the cross reference is pointing to an object with the wrong number. 51 // Update the table. 52 func (parser *PdfParser) rebuildXrefTable() error { 53 newXrefs := XrefTable{} 54 for objNum, xref := range parser.xrefs { 55 obj, _, err := parser.lookupByNumberWrapper(objNum, false) 56 if err != nil { 57 common.Log.Debug("ERROR: Unable to look up object (%s)", err) 58 common.Log.Debug("ERROR: Xref table completely broken - attempting to repair ") 59 xrefTable, err := parser.repairRebuildXrefsTopDown() 60 if err != nil { 61 common.Log.Debug("ERROR: Failed xref rebuild repair (%s)", err) 62 return err 63 } 64 parser.xrefs = *xrefTable 65 common.Log.Debug("Repaired xref table built") 66 return nil 67 } 68 actObjNum, actGenNum, err := getObjectNumber(obj) 69 if err != nil { 70 return err 71 } 72 73 xref.objectNumber = int(actObjNum) 74 xref.generation = int(actGenNum) 75 newXrefs[int(actObjNum)] = xref 76 } 77 78 parser.xrefs = newXrefs 79 common.Log.Debug("New xref table built") 80 printXrefTable(parser.xrefs) 81 return nil 82 } 83 84 // Parses and returns the object and generation number from a string such as "12 0 obj" -> (12,0,nil). 85 func parseObjectNumberFromString(str string) (int, int, error) { 86 result := reIndirectObject.FindStringSubmatch(str) 87 if len(result) < 3 { 88 return 0, 0, errors.New("Unable to detect indirect object signature") 89 } 90 91 on, _ := strconv.Atoi(result[1]) 92 gn, _ := strconv.Atoi(result[2]) 93 94 return on, gn, nil 95 } 96 97 // Parse the entire file from top down. 98 // Goes through the file byte-by-byte looking for "<num> <generation> obj" patterns. 99 // N.B. This collects the XREF_TABLE_ENTRY data only. 100 func (parser *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) { 101 if parser.repairsAttempted { 102 // Avoid multiple repairs (only try once). 103 return nil, fmt.Errorf("Repair failed") 104 } 105 parser.repairsAttempted = true 106 107 // Go to beginning, reset reader. 108 parser.rs.Seek(0, os.SEEK_SET) 109 parser.reader = bufio.NewReader(parser.rs) 110 111 // Keep a running buffer of last bytes. 112 bufLen := 20 113 last := make([]byte, bufLen) 114 115 xrefTable := XrefTable{} 116 for { 117 b, err := parser.reader.ReadByte() 118 if err != nil { 119 if err == io.EOF { 120 break 121 } else { 122 return nil, err 123 } 124 } 125 126 // Format: 127 // object number - whitespace - generation number - obj 128 // e.g. "12 0 obj" 129 if b == 'j' && last[bufLen-1] == 'b' && last[bufLen-2] == 'o' && IsWhiteSpace(last[bufLen-3]) { 130 i := bufLen - 4 131 // Go past whitespace 132 for IsWhiteSpace(last[i]) && i > 0 { 133 i-- 134 } 135 if i == 0 || !IsDecimalDigit(last[i]) { 136 continue 137 } 138 // Go past generation number 139 for IsDecimalDigit(last[i]) && i > 0 { 140 i-- 141 } 142 if i == 0 || !IsWhiteSpace(last[i]) { 143 continue 144 } 145 // Go past whitespace 146 for IsWhiteSpace(last[i]) && i > 0 { 147 i-- 148 } 149 if i == 0 || !IsDecimalDigit(last[i]) { 150 continue 151 } 152 // Go past object number. 153 for IsDecimalDigit(last[i]) && i > 0 { 154 i-- 155 } 156 if i == 0 { 157 continue // Probably too long to be a valid object... 158 } 159 160 objOffset := parser.GetFileOffset() - int64(bufLen-i) 161 162 objstr := append(last[i+1:], b) 163 objNum, genNum, err := parseObjectNumberFromString(string(objstr)) 164 if err != nil { 165 common.Log.Debug("Unable to parse object number: %v", err) 166 return nil, err 167 } 168 169 // Create and insert the XREF entry if not existing, or the generation number is higher. 170 if curXref, has := xrefTable[objNum]; !has || curXref.generation < genNum { 171 // Make the entry for the cross ref table. 172 xrefEntry := XrefObject{} 173 xrefEntry.xtype = XREF_TABLE_ENTRY 174 xrefEntry.objectNumber = int(objNum) 175 xrefEntry.generation = int(genNum) 176 xrefEntry.offset = objOffset 177 xrefTable[objNum] = xrefEntry 178 } 179 } 180 181 last = append(last[1:bufLen], b) 182 } 183 184 return &xrefTable, nil 185 } 186 187 // Look for first sign of xref table from end of file. 188 func (parser *PdfParser) repairSeekXrefMarker() error { 189 // Get the file size. 190 fSize, err := parser.rs.Seek(0, os.SEEK_END) 191 if err != nil { 192 return err 193 } 194 195 reXrefTableStart := regexp.MustCompile(`\sxref\s*`) 196 197 // Define the starting point (from the end of the file) to search from. 198 var offset int64 = 0 199 200 // Define an buffer length in terms of how many bytes to read from the end of the file. 201 var buflen int64 = 1000 202 203 for offset < fSize { 204 if fSize <= (buflen + offset) { 205 buflen = fSize - offset 206 } 207 208 // Move back enough (as we need to read forward). 209 _, err := parser.rs.Seek(-offset-buflen, os.SEEK_END) 210 if err != nil { 211 return err 212 } 213 214 // Read the data. 215 b1 := make([]byte, buflen) 216 parser.rs.Read(b1) 217 218 common.Log.Trace("Looking for xref : \"%s\"", string(b1)) 219 ind := reXrefTableStart.FindAllStringIndex(string(b1), -1) 220 if ind != nil { 221 // Found it. 222 lastInd := ind[len(ind)-1] 223 common.Log.Trace("Ind: % d", ind) 224 parser.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END) 225 parser.reader = bufio.NewReader(parser.rs) 226 // Go past whitespace, finish at 'x'. 227 for { 228 bb, err := parser.reader.Peek(1) 229 if err != nil { 230 return err 231 } 232 common.Log.Trace("B: %d %c", bb[0], bb[0]) 233 if !IsWhiteSpace(bb[0]) { 234 break 235 } 236 parser.reader.Discard(1) 237 } 238 239 return nil 240 } else { 241 common.Log.Debug("Warning: EOF marker not found! - continue seeking") 242 } 243 244 offset += buflen 245 } 246 247 common.Log.Debug("Error: Xref table marker was not found.") 248 return errors.New("xref not found ") 249 } 250 251 // Called when Pdf version not found normally. Looks for the PDF version by scanning top-down. 252 // %PDF-1.7 253 func (parser *PdfParser) seekPdfVersionTopDown() (int, int, error) { 254 // Go to beginning, reset reader. 255 parser.rs.Seek(0, os.SEEK_SET) 256 parser.reader = bufio.NewReader(parser.rs) 257 258 // Keep a running buffer of last bytes. 259 bufLen := 20 260 last := make([]byte, bufLen) 261 262 for { 263 b, err := parser.reader.ReadByte() 264 if err != nil { 265 if err == io.EOF { 266 break 267 } else { 268 return 0, 0, err 269 } 270 } 271 272 // Format: 273 // object number - whitespace - generation number - obj 274 // e.g. "12 0 obj" 275 if IsDecimalDigit(b) && last[bufLen-1] == '.' && IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' && 276 last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' { 277 major := int(last[bufLen-2] - '0') 278 minor := int(b - '0') 279 return major, minor, nil 280 } 281 282 last = append(last[1:bufLen], b) 283 } 284 285 return 0, 0, errors.New("Version not found") 286 }