github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/core/repairs.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  // Routines related to repairing malformed pdf files.
     7  
     8  package core
     9  
    10  import (
    11  	"errors"
    12  	"fmt"
    13  	"os"
    14  	"regexp"
    15  
    16  	"bufio"
    17  	"io"
    18  	"strconv"
    19  
    20  	"github.com/unidoc/unidoc/common"
    21  )
    22  
    23  var repairReXrefTable = regexp.MustCompile(`[\r\n]\s*(xref)\s*[\r\n]`)
    24  
    25  // Locates a standard Xref table by looking for the "xref" entry.
    26  // Xref object stream not supported.
    27  func (parser *PdfParser) repairLocateXref() (int64, error) {
    28  	readBuf := int64(1000)
    29  	parser.rs.Seek(-readBuf, os.SEEK_CUR)
    30  
    31  	curOffset, err := parser.rs.Seek(0, os.SEEK_CUR)
    32  	if err != nil {
    33  		return 0, err
    34  	}
    35  	b2 := make([]byte, readBuf)
    36  	parser.rs.Read(b2)
    37  
    38  	results := repairReXrefTable.FindAllStringIndex(string(b2), -1)
    39  	if len(results) < 1 {
    40  		common.Log.Debug("ERROR: Repair: xref not found!")
    41  		return 0, errors.New("Repair: xref not found")
    42  	}
    43  
    44  	localOffset := int64(results[len(results)-1][0])
    45  	xrefOffset := curOffset + localOffset
    46  	return xrefOffset, nil
    47  }
    48  
    49  // Renumbers the xref table.
    50  // Useful when the cross reference is pointing to an object with the wrong number.
    51  // Update the table.
    52  func (parser *PdfParser) rebuildXrefTable() error {
    53  	newXrefs := XrefTable{}
    54  	for objNum, xref := range parser.xrefs {
    55  		obj, _, err := parser.lookupByNumberWrapper(objNum, false)
    56  		if err != nil {
    57  			common.Log.Debug("ERROR: Unable to look up object (%s)", err)
    58  			common.Log.Debug("ERROR: Xref table completely broken - attempting to repair ")
    59  			xrefTable, err := parser.repairRebuildXrefsTopDown()
    60  			if err != nil {
    61  				common.Log.Debug("ERROR: Failed xref rebuild repair (%s)", err)
    62  				return err
    63  			}
    64  			parser.xrefs = *xrefTable
    65  			common.Log.Debug("Repaired xref table built")
    66  			return nil
    67  		}
    68  		actObjNum, actGenNum, err := getObjectNumber(obj)
    69  		if err != nil {
    70  			return err
    71  		}
    72  
    73  		xref.objectNumber = int(actObjNum)
    74  		xref.generation = int(actGenNum)
    75  		newXrefs[int(actObjNum)] = xref
    76  	}
    77  
    78  	parser.xrefs = newXrefs
    79  	common.Log.Debug("New xref table built")
    80  	printXrefTable(parser.xrefs)
    81  	return nil
    82  }
    83  
    84  // Parses and returns the object and generation number from a string such as "12 0 obj" -> (12,0,nil).
    85  func parseObjectNumberFromString(str string) (int, int, error) {
    86  	result := reIndirectObject.FindStringSubmatch(str)
    87  	if len(result) < 3 {
    88  		return 0, 0, errors.New("Unable to detect indirect object signature")
    89  	}
    90  
    91  	on, _ := strconv.Atoi(result[1])
    92  	gn, _ := strconv.Atoi(result[2])
    93  
    94  	return on, gn, nil
    95  }
    96  
    97  // Parse the entire file from top down.
    98  // Goes through the file byte-by-byte looking for "<num> <generation> obj" patterns.
    99  // N.B. This collects the XREF_TABLE_ENTRY data only.
   100  func (parser *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
   101  	if parser.repairsAttempted {
   102  		// Avoid multiple repairs (only try once).
   103  		return nil, fmt.Errorf("Repair failed")
   104  	}
   105  	parser.repairsAttempted = true
   106  
   107  	// Go to beginning, reset reader.
   108  	parser.rs.Seek(0, os.SEEK_SET)
   109  	parser.reader = bufio.NewReader(parser.rs)
   110  
   111  	// Keep a running buffer of last bytes.
   112  	bufLen := 20
   113  	last := make([]byte, bufLen)
   114  
   115  	xrefTable := XrefTable{}
   116  	for {
   117  		b, err := parser.reader.ReadByte()
   118  		if err != nil {
   119  			if err == io.EOF {
   120  				break
   121  			} else {
   122  				return nil, err
   123  			}
   124  		}
   125  
   126  		// Format:
   127  		// object number - whitespace - generation number - obj
   128  		// e.g. "12 0 obj"
   129  		if b == 'j' && last[bufLen-1] == 'b' && last[bufLen-2] == 'o' && IsWhiteSpace(last[bufLen-3]) {
   130  			i := bufLen - 4
   131  			// Go past whitespace
   132  			for IsWhiteSpace(last[i]) && i > 0 {
   133  				i--
   134  			}
   135  			if i == 0 || !IsDecimalDigit(last[i]) {
   136  				continue
   137  			}
   138  			// Go past generation number
   139  			for IsDecimalDigit(last[i]) && i > 0 {
   140  				i--
   141  			}
   142  			if i == 0 || !IsWhiteSpace(last[i]) {
   143  				continue
   144  			}
   145  			// Go past whitespace
   146  			for IsWhiteSpace(last[i]) && i > 0 {
   147  				i--
   148  			}
   149  			if i == 0 || !IsDecimalDigit(last[i]) {
   150  				continue
   151  			}
   152  			// Go past object number.
   153  			for IsDecimalDigit(last[i]) && i > 0 {
   154  				i--
   155  			}
   156  			if i == 0 {
   157  				continue // Probably too long to be a valid object...
   158  			}
   159  
   160  			objOffset := parser.GetFileOffset() - int64(bufLen-i)
   161  
   162  			objstr := append(last[i+1:], b)
   163  			objNum, genNum, err := parseObjectNumberFromString(string(objstr))
   164  			if err != nil {
   165  				common.Log.Debug("Unable to parse object number: %v", err)
   166  				return nil, err
   167  			}
   168  
   169  			// Create and insert the XREF entry if not existing, or the generation number is higher.
   170  			if curXref, has := xrefTable[objNum]; !has || curXref.generation < genNum {
   171  				// Make the entry for the cross ref table.
   172  				xrefEntry := XrefObject{}
   173  				xrefEntry.xtype = XREF_TABLE_ENTRY
   174  				xrefEntry.objectNumber = int(objNum)
   175  				xrefEntry.generation = int(genNum)
   176  				xrefEntry.offset = objOffset
   177  				xrefTable[objNum] = xrefEntry
   178  			}
   179  		}
   180  
   181  		last = append(last[1:bufLen], b)
   182  	}
   183  
   184  	return &xrefTable, nil
   185  }
   186  
   187  // Look for first sign of xref table from end of file.
   188  func (parser *PdfParser) repairSeekXrefMarker() error {
   189  	// Get the file size.
   190  	fSize, err := parser.rs.Seek(0, os.SEEK_END)
   191  	if err != nil {
   192  		return err
   193  	}
   194  
   195  	reXrefTableStart := regexp.MustCompile(`\sxref\s*`)
   196  
   197  	// Define the starting point (from the end of the file) to search from.
   198  	var offset int64 = 0
   199  
   200  	// Define an buffer length in terms of how many bytes to read from the end of the file.
   201  	var buflen int64 = 1000
   202  
   203  	for offset < fSize {
   204  		if fSize <= (buflen + offset) {
   205  			buflen = fSize - offset
   206  		}
   207  
   208  		// Move back enough (as we need to read forward).
   209  		_, err := parser.rs.Seek(-offset-buflen, os.SEEK_END)
   210  		if err != nil {
   211  			return err
   212  		}
   213  
   214  		// Read the data.
   215  		b1 := make([]byte, buflen)
   216  		parser.rs.Read(b1)
   217  
   218  		common.Log.Trace("Looking for xref : \"%s\"", string(b1))
   219  		ind := reXrefTableStart.FindAllStringIndex(string(b1), -1)
   220  		if ind != nil {
   221  			// Found it.
   222  			lastInd := ind[len(ind)-1]
   223  			common.Log.Trace("Ind: % d", ind)
   224  			parser.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END)
   225  			parser.reader = bufio.NewReader(parser.rs)
   226  			// Go past whitespace, finish at 'x'.
   227  			for {
   228  				bb, err := parser.reader.Peek(1)
   229  				if err != nil {
   230  					return err
   231  				}
   232  				common.Log.Trace("B: %d %c", bb[0], bb[0])
   233  				if !IsWhiteSpace(bb[0]) {
   234  					break
   235  				}
   236  				parser.reader.Discard(1)
   237  			}
   238  
   239  			return nil
   240  		} else {
   241  			common.Log.Debug("Warning: EOF marker not found! - continue seeking")
   242  		}
   243  
   244  		offset += buflen
   245  	}
   246  
   247  	common.Log.Debug("Error: Xref table marker was not found.")
   248  	return errors.New("xref not found ")
   249  }
   250  
   251  // Called when Pdf version not found normally.  Looks for the PDF version by scanning top-down.
   252  // %PDF-1.7
   253  func (parser *PdfParser) seekPdfVersionTopDown() (int, int, error) {
   254  	// Go to beginning, reset reader.
   255  	parser.rs.Seek(0, os.SEEK_SET)
   256  	parser.reader = bufio.NewReader(parser.rs)
   257  
   258  	// Keep a running buffer of last bytes.
   259  	bufLen := 20
   260  	last := make([]byte, bufLen)
   261  
   262  	for {
   263  		b, err := parser.reader.ReadByte()
   264  		if err != nil {
   265  			if err == io.EOF {
   266  				break
   267  			} else {
   268  				return 0, 0, err
   269  			}
   270  		}
   271  
   272  		// Format:
   273  		// object number - whitespace - generation number - obj
   274  		// e.g. "12 0 obj"
   275  		if IsDecimalDigit(b) && last[bufLen-1] == '.' && IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' &&
   276  			last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' {
   277  			major := int(last[bufLen-2] - '0')
   278  			minor := int(b - '0')
   279  			return major, minor, nil
   280  		}
   281  
   282  		last = append(last[1:bufLen], b)
   283  	}
   284  
   285  	return 0, 0, errors.New("Version not found")
   286  }