github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/core/utils.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  package core
     7  
     8  import (
     9  	"errors"
    10  	"fmt"
    11  	"sort"
    12  
    13  	"github.com/unidoc/unidoc/common"
    14  )
    15  
    16  // Check slice range to make sure within bounds for accessing:
    17  //    slice[a:b] where sliceLen=len(slice).
    18  func checkBounds(sliceLen, a, b int) error {
    19  	if a < 0 || a > sliceLen {
    20  		return errors.New("Slice index a out of bounds")
    21  	}
    22  	if b < a {
    23  		return errors.New("Invalid slice index b < a")
    24  	}
    25  	if b > sliceLen {
    26  		return errors.New("Slice index b out of bounds")
    27  	}
    28  
    29  	return nil
    30  }
    31  
    32  // Inspect analyzes the document object structure.
    33  func (parser *PdfParser) Inspect() (map[string]int, error) {
    34  	return parser.inspect()
    35  }
    36  
    37  // GetObjectNums returns a sorted list of object numbers of the PDF objects in the file.
    38  func (parser *PdfParser) GetObjectNums() []int {
    39  	objNums := []int{}
    40  	for _, x := range parser.xrefs {
    41  		objNums = append(objNums, x.objectNumber)
    42  	}
    43  
    44  	// Sort the object numbers to give consistent ordering of PDF objects in output.
    45  	// Needed since parser.xrefs is a map.
    46  	sort.Ints(objNums)
    47  
    48  	return objNums
    49  }
    50  
    51  func getUniDocVersion() string {
    52  	return common.Version
    53  }
    54  
    55  /*
    56   * Inspect object types.
    57   * Go through all objects in the cross ref table and detect the types.
    58   * Mostly for debugging purposes and inspecting odd PDF files.
    59   */
    60  func (parser *PdfParser) inspect() (map[string]int, error) {
    61  	common.Log.Trace("--------INSPECT ----------")
    62  	common.Log.Trace("Xref table:")
    63  
    64  	objTypes := map[string]int{}
    65  	objCount := 0
    66  	failedCount := 0
    67  
    68  	keys := []int{}
    69  	for k := range parser.xrefs {
    70  		keys = append(keys, k)
    71  	}
    72  	sort.Ints(keys)
    73  
    74  	i := 0
    75  	for _, k := range keys {
    76  		xref := parser.xrefs[k]
    77  		if xref.objectNumber == 0 {
    78  			continue
    79  		}
    80  		objCount++
    81  		common.Log.Trace("==========")
    82  		common.Log.Trace("Looking up object number: %d", xref.objectNumber)
    83  		o, err := parser.LookupByNumber(xref.objectNumber)
    84  		if err != nil {
    85  			common.Log.Trace("ERROR: Fail to lookup obj %d (%s)", xref.objectNumber, err)
    86  			failedCount++
    87  			continue
    88  		}
    89  
    90  		common.Log.Trace("obj: %s", o)
    91  
    92  		iobj, isIndirect := o.(*PdfIndirectObject)
    93  		if isIndirect {
    94  			common.Log.Trace("IND OOBJ %d: %s", xref.objectNumber, iobj)
    95  			dict, isDict := iobj.PdfObject.(*PdfObjectDictionary)
    96  			if isDict {
    97  				// Check if has Type parameter.
    98  				if ot, has := dict.Get("Type").(*PdfObjectName); has {
    99  					otype := string(*ot)
   100  					common.Log.Trace("---> Obj type: %s", otype)
   101  					_, isDefined := objTypes[otype]
   102  					if isDefined {
   103  						objTypes[otype]++
   104  					} else {
   105  						objTypes[otype] = 1
   106  					}
   107  				} else if ot, has := dict.Get("Subtype").(*PdfObjectName); has {
   108  					// Check if subtype
   109  					otype := string(*ot)
   110  					common.Log.Trace("---> Obj subtype: %s", otype)
   111  					_, isDefined := objTypes[otype]
   112  					if isDefined {
   113  						objTypes[otype]++
   114  					} else {
   115  						objTypes[otype] = 1
   116  					}
   117  				}
   118  				if val, has := dict.Get("S").(*PdfObjectName); has && *val == "JavaScript" {
   119  					// Check if Javascript.
   120  					_, isDefined := objTypes["JavaScript"]
   121  					if isDefined {
   122  						objTypes["JavaScript"]++
   123  					} else {
   124  						objTypes["JavaScript"] = 1
   125  					}
   126  				}
   127  
   128  			}
   129  		} else if sobj, isStream := o.(*PdfObjectStream); isStream {
   130  			if otype, ok := sobj.PdfObjectDictionary.Get("Type").(*PdfObjectName); ok {
   131  				common.Log.Trace("--> Stream object type: %s", *otype)
   132  				k := string(*otype)
   133  				if _, isDefined := objTypes[k]; isDefined {
   134  					objTypes[k]++
   135  				} else {
   136  					objTypes[k] = 1
   137  				}
   138  			}
   139  		} else { // Direct.
   140  			dict, isDict := o.(*PdfObjectDictionary)
   141  			if isDict {
   142  				ot, isName := dict.Get("Type").(*PdfObjectName)
   143  				if isName {
   144  					otype := string(*ot)
   145  					common.Log.Trace("--- obj type %s", otype)
   146  					objTypes[otype]++
   147  				}
   148  			}
   149  			common.Log.Trace("DIRECT OBJ %d: %s", xref.objectNumber, o)
   150  		}
   151  
   152  		i++
   153  	}
   154  	common.Log.Trace("--------EOF INSPECT ----------")
   155  	common.Log.Trace("=======")
   156  	common.Log.Trace("Object count: %d", objCount)
   157  	common.Log.Trace("Failed lookup: %d", failedCount)
   158  	for t, c := range objTypes {
   159  		common.Log.Trace("%s: %d", t, c)
   160  	}
   161  	common.Log.Trace("=======")
   162  
   163  	if len(parser.xrefs) < 1 {
   164  		common.Log.Debug("ERROR: This document is invalid (xref table missing!)")
   165  		return nil, fmt.Errorf("Invalid document (xref table missing)")
   166  	}
   167  
   168  	fontObjs, ok := objTypes["Font"]
   169  	if !ok || fontObjs < 2 {
   170  		common.Log.Trace("This document is probably scanned!")
   171  	} else {
   172  		common.Log.Trace("This document is valid for extraction!")
   173  	}
   174  
   175  	return objTypes, nil
   176  }
   177  
   178  func absInt(x int) int {
   179  	if x < 0 {
   180  		return -x
   181  	} else {
   182  		return x
   183  	}
   184  }