github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/internal/cmap/cmap.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  package cmap
     7  
     8  import (
     9  	"bytes"
    10  	"errors"
    11  	"io"
    12  
    13  	"github.com/unidoc/unidoc/common"
    14  	"github.com/unidoc/unidoc/pdf/model/textencoding"
    15  )
    16  
// CMap represents a character code to unicode mapping used in PDF files.
// Instances are created by LoadCmapFromData, which fills the tables below by
// parsing raw CMap data.
type CMap struct {
	// Embedded parser over the raw CMap data; assigned by LoadCmapFromData.
	*cMapParser

	// Text encoder to look up runes from input glyph names.
	// It may be nil, in which case glyph-name targets in bfchar sections are
	// mapped to "?".
	encoder textencoding.TextEncoder

	// map of character code to string (sequence of runes) for 1-4 byte codes separately.
	// Index i holds the table for codes that are i+1 bytes long.
	codeMap [4]map[uint64]string

	// name and ctype hold the CMap name and type read by parse; codespaces
	// collects the ranges read by parseCodespaceRange, in input order.
	name       string
	ctype      int
	codespaces []codespace
}
    31  
    32  // codespace represents a single codespace range used in the CMap.
    33  type codespace struct {
    34  	numBytes int
    35  	low      uint64
    36  	high     uint64
    37  }
    38  
// Name returns the name of the CMap.
// It is the empty string if no name has been recorded for this CMap.
func (cmap *CMap) Name() string {
	return cmap.name
}
    43  
// Type returns the type of the CMap.
// It is zero if no type has been recorded for this CMap.
func (cmap *CMap) Type() int {
	return cmap.ctype
}
    48  
    49  // CharcodeBytesToUnicode converts a byte array of charcodes to a unicode string representation.
    50  func (cmap *CMap) CharcodeBytesToUnicode(src []byte) string {
    51  	var buf bytes.Buffer
    52  
    53  	// Maximum number of possible bytes per code.
    54  	maxLen := 4
    55  
    56  	i := 0
    57  	for i < len(src) {
    58  		var code uint64
    59  		var j int
    60  		for j = 0; j < maxLen && i+j < len(src); j++ {
    61  			b := src[i+j]
    62  
    63  			code <<= 8
    64  			code |= uint64(b)
    65  
    66  			tgt, has := cmap.codeMap[j][code]
    67  			if has {
    68  				buf.WriteString(tgt)
    69  				break
    70  			} else if j == maxLen-1 || i+j == len(src)-1 {
    71  				break
    72  			}
    73  		}
    74  		i += j + 1
    75  	}
    76  
    77  	return buf.String()
    78  }
    79  
    80  // CharcodeToUnicode converts a single character code to unicode string.
    81  // Note that CharcodeBytesToUnicode is typically more efficient.
    82  func (cmap *CMap) CharcodeToUnicode(srcCode uint64) string {
    83  	// Search through different code lengths.
    84  	for numBytes := 1; numBytes <= 4; numBytes++ {
    85  		if c, has := cmap.codeMap[numBytes-1][srcCode]; has {
    86  			return c
    87  		}
    88  	}
    89  
    90  	// Not found.
    91  	return "?"
    92  }
    93  
    94  // newCMap returns an initialized CMap.
    95  func newCMap() *CMap {
    96  	cmap := &CMap{}
    97  	cmap.codespaces = []codespace{}
    98  	cmap.codeMap = [4]map[uint64]string{}
    99  	// Maps for 1-4 bytes are initialized. Minimal overhead if not used (most commonly used are 1-2 bytes).
   100  	cmap.codeMap[0] = map[uint64]string{}
   101  	cmap.codeMap[1] = map[uint64]string{}
   102  	cmap.codeMap[2] = map[uint64]string{}
   103  	cmap.codeMap[3] = map[uint64]string{}
   104  	return cmap
   105  }
   106  
   107  // LoadCmapFromData parses CMap data in memory through a byte vector and returns a CMap which
   108  // can be used for character code to unicode conversion.
   109  func LoadCmapFromData(data []byte) (*CMap, error) {
   110  	cmap := newCMap()
   111  	cmap.cMapParser = newCMapParser(data)
   112  
   113  	err := cmap.parse()
   114  	if err != nil {
   115  		return cmap, err
   116  	}
   117  
   118  	return cmap, nil
   119  }
   120  
   121  // parse parses the CMap file and loads into the CMap structure.
   122  func (cmap *CMap) parse() error {
   123  	for {
   124  		o, err := cmap.parseObject()
   125  		if err != nil {
   126  			if err == io.EOF {
   127  				break
   128  			}
   129  
   130  			common.Log.Debug("Error parsing CMap: %v", err)
   131  			return err
   132  		}
   133  
   134  		if op, isOp := o.(cmapOperand); isOp {
   135  			common.Log.Trace("Operand: %s", op.Operand)
   136  
   137  			if op.Operand == begincodespacerange {
   138  				err := cmap.parseCodespaceRange()
   139  				if err != nil {
   140  					return err
   141  				}
   142  			} else if op.Operand == beginbfchar {
   143  				err := cmap.parseBfchar()
   144  				if err != nil {
   145  					return err
   146  				}
   147  			} else if op.Operand == beginbfrange {
   148  				err := cmap.parseBfrange()
   149  				if err != nil {
   150  					return err
   151  				}
   152  			}
   153  		} else if n, isName := o.(cmapName); isName {
   154  			if n.Name == cmapname {
   155  				o, err := cmap.parseObject()
   156  				if err != nil {
   157  					if err == io.EOF {
   158  						break
   159  					}
   160  					return err
   161  				}
   162  				name, ok := o.(cmapName)
   163  				if !ok {
   164  					return errors.New("CMap name not a name")
   165  				}
   166  				cmap.name = name.Name
   167  			} else if n.Name == cmaptype {
   168  				o, err := cmap.parseObject()
   169  				if err != nil {
   170  					if err == io.EOF {
   171  						break
   172  					}
   173  					return err
   174  				}
   175  				typeInt, ok := o.(cmapInt)
   176  				if !ok {
   177  					return errors.New("CMap type not an integer")
   178  				}
   179  				cmap.ctype = int(typeInt.val)
   180  			}
   181  		} else {
   182  			common.Log.Trace("Unhandled object: %T %#v", o, o)
   183  		}
   184  	}
   185  
   186  	return nil
   187  }
   188  
   189  // parseCodespaceRange parses the codespace range section of a CMap.
   190  func (cmap *CMap) parseCodespaceRange() error {
   191  	for {
   192  		o, err := cmap.parseObject()
   193  		if err != nil {
   194  			if err == io.EOF {
   195  				break
   196  			}
   197  			return err
   198  		}
   199  
   200  		hexLow, isHex := o.(cmapHexString)
   201  		if !isHex {
   202  			if op, isOperand := o.(cmapOperand); isOperand {
   203  				if op.Operand == endcodespacerange {
   204  					return nil
   205  				}
   206  				return errors.New("Unexpected operand")
   207  			}
   208  		}
   209  
   210  		o, err = cmap.parseObject()
   211  		if err != nil {
   212  			if err == io.EOF {
   213  				break
   214  			}
   215  			return err
   216  		}
   217  		hexHigh, ok := o.(cmapHexString)
   218  		if !ok {
   219  			return errors.New("Non-hex high")
   220  		}
   221  
   222  		if hexLow.numBytes != hexHigh.numBytes {
   223  			return errors.New("Unequal number of bytes in range")
   224  		}
   225  
   226  		low := hexToUint64(hexLow)
   227  		high := hexToUint64(hexHigh)
   228  		numBytes := hexLow.numBytes
   229  
   230  		cspace := codespace{numBytes: numBytes, low: low, high: high}
   231  		cmap.codespaces = append(cmap.codespaces, cspace)
   232  
   233  		common.Log.Trace("Codespace low: 0x%X, high: 0x%X", low, high)
   234  	}
   235  
   236  	return nil
   237  }
   238  
   239  // parseBfchar parses a bfchar section of a CMap file.
   240  func (cmap *CMap) parseBfchar() error {
   241  	for {
   242  		// Src code.
   243  		o, err := cmap.parseObject()
   244  		if err != nil {
   245  			if err == io.EOF {
   246  				break
   247  			}
   248  			return err
   249  		}
   250  		var srcCode uint64
   251  		var numBytes int
   252  
   253  		switch v := o.(type) {
   254  		case cmapOperand:
   255  			if v.Operand == endbfchar {
   256  				return nil
   257  			}
   258  			return errors.New("Unexpected operand")
   259  		case cmapHexString:
   260  			srcCode = hexToUint64(v)
   261  			numBytes = v.numBytes
   262  		default:
   263  			return errors.New("Unexpected type")
   264  		}
   265  
   266  		// Target code.
   267  		o, err = cmap.parseObject()
   268  		if err != nil {
   269  			if err == io.EOF {
   270  				break
   271  			}
   272  			return err
   273  		}
   274  		var toCode string
   275  
   276  		switch v := o.(type) {
   277  		case cmapOperand:
   278  			if v.Operand == endbfchar {
   279  				return nil
   280  			}
   281  			return errors.New("Unexpected operand")
   282  		case cmapHexString:
   283  			toCode = hexToString(v)
   284  		case cmapName:
   285  			toCode = "?"
   286  			if cmap.encoder != nil {
   287  				if r, found := cmap.encoder.GlyphToRune(v.Name); found {
   288  					toCode = string(r)
   289  				}
   290  			}
   291  		default:
   292  			return errors.New("Unexpected type")
   293  		}
   294  
   295  		if numBytes <= 0 || numBytes > 4 {
   296  			return errors.New("Invalid code length")
   297  		}
   298  
   299  		cmap.codeMap[numBytes-1][srcCode] = toCode
   300  	}
   301  
   302  	return nil
   303  }
   304  
   305  // parseBfrange parses a bfrange section of a CMap file.
   306  func (cmap *CMap) parseBfrange() error {
   307  	for {
   308  		// The specifications are in pairs of 3.
   309  		// <srcCodeFrom> <srcCodeTo> <target>
   310  		// where target can be either <destFrom> as a hex code, or a list.
   311  
   312  		// Src code from.
   313  		var srcCodeFrom uint64
   314  		var numBytes int
   315  		{
   316  			o, err := cmap.parseObject()
   317  			if err != nil {
   318  				if err == io.EOF {
   319  					break
   320  				}
   321  				return err
   322  			}
   323  
   324  			switch v := o.(type) {
   325  			case cmapOperand:
   326  				if v.Operand == endbfrange {
   327  					return nil
   328  				}
   329  				return errors.New("Unexpected operand")
   330  			case cmapHexString:
   331  				srcCodeFrom = hexToUint64(v)
   332  				numBytes = v.numBytes
   333  			default:
   334  				return errors.New("Unexpected type")
   335  			}
   336  		}
   337  
   338  		// Src code to.
   339  		var srcCodeTo uint64
   340  		{
   341  			o, err := cmap.parseObject()
   342  			if err != nil {
   343  				if err == io.EOF {
   344  					break
   345  				}
   346  				return err
   347  			}
   348  
   349  			switch v := o.(type) {
   350  			case cmapOperand:
   351  				if v.Operand == endbfrange {
   352  					return nil
   353  				}
   354  				return errors.New("Unexpected operand")
   355  			case cmapHexString:
   356  				srcCodeTo = hexToUint64(v)
   357  			default:
   358  				return errors.New("Unexpected type")
   359  			}
   360  		}
   361  
   362  		// target(s).
   363  		o, err := cmap.parseObject()
   364  		if err != nil {
   365  			if err == io.EOF {
   366  				break
   367  			}
   368  			return err
   369  		}
   370  
   371  		if numBytes <= 0 || numBytes > 4 {
   372  			return errors.New("Invalid code length")
   373  		}
   374  
   375  		switch v := o.(type) {
   376  		case cmapArray:
   377  			sc := srcCodeFrom
   378  			for _, o := range v.Array {
   379  				hexs, ok := o.(cmapHexString)
   380  				if !ok {
   381  					return errors.New("Non-hex string in array")
   382  				}
   383  				cmap.codeMap[numBytes-1][sc] = hexToString(hexs)
   384  				sc++
   385  			}
   386  			if sc != srcCodeTo+1 {
   387  				return errors.New("Invalid number of items in array")
   388  			}
   389  		case cmapHexString:
   390  			// <srcCodeFrom> <srcCodeTo> <dstCode>, maps [from,to] to [dstCode,dstCode+to-from].
   391  			// in hex format.
   392  			target := hexToUint64(v)
   393  			i := uint64(0)
   394  			for sc := srcCodeFrom; sc <= srcCodeTo; sc++ {
   395  				r := target + i
   396  				cmap.codeMap[numBytes-1][sc] = string(r)
   397  				i++
   398  			}
   399  		default:
   400  			return errors.New("Unexpected type")
   401  		}
   402  	}
   403  
   404  	return nil
   405  }