github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/internal/cmap/cmap_test.go (about)

     1  /*
     2   * This file is subject to the terms and conditions defined in
     3   * file 'LICENSE.md', which is part of this source code package.
     4   */
     5  
     6  package cmap
     7  
     8  import (
     9  	"testing"
    10  )
    11  
    // init is intentionally empty; it only holds commented-out logger setup used while debugging.
    12  func init() {
    13  	// Uncomment one of the lines below when debugging to get debug or trace logging output.
        	// NOTE(review): `common` is not in this file's import list, so its import must be added as well.
    14  	//common.SetLogger(common.NewConsoleLogger(common.LogLevelDebug))
    15  	//common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
    16  }
    17  
    18  // cmap1Data is the CMap fixture loaded by TestCMapParser1. It declares a single codespace
        // range <0000>-<FFFF>, 8 bfchar entries and 7 bfrange entries, and is named
        // Adobe-Identity-UCS with CMapType 2.
    19  const cmap1Data = `
    20  /CIDInit /ProcSet findresource begin
    21  12 dict begin
    22  begincmap
    23  /CIDSystemInfo
    24  <<  /Registry (Adobe)
    25  /Ordering (UCS)
    26  /Supplement 0
    27  >> def
    28  /CMapName /Adobe-Identity-UCS def
    29  /CMapType 2 def
    30  1 begincodespacerange
    31  <0000> <FFFF>
    32  endcodespacerange
    33  8 beginbfchar
    34  <0003> <0020>
    35  <0007> <0024>
    36  <0033> <0050>
    37  <0035> <0052>
    38  <0037> <0054>
    39  <005A> <0077>
    40  <005C> <0079>
    41  <005F> <007C>
    42  endbfchar
    43  7 beginbfrange
    44  <000F> <0017> <002C>
    45  <001B> <001D> <0038>
    46  <0025> <0026> <0042>
    47  <002F> <0031> <004C>
    48  <0044> <004C> <0061>
    49  <004F> <0053> <006C>
    50  <0055> <0057> <0072>
    51  endbfrange
    52  endcmap
    53  CMapName currentdict /CMap defineresource pop
    54  end
    55  end
    56  `
    57  
    58  // TestCMapParser tests basic loading of a simple CMap.
    59  func TestCMapParser1(t *testing.T) {
    60  	cmap, err := LoadCmapFromData([]byte(cmap1Data))
    61  	if err != nil {
    62  		t.Error("Failed: ", err)
    63  		return
    64  	}
    65  
    66  	if cmap.Name() != "Adobe-Identity-UCS" {
    67  		t.Errorf("CMap name incorrect (%s)", cmap.Name())
    68  		return
    69  	}
    70  
    71  	if cmap.Type() != 2 {
    72  		t.Errorf("CMap type incorrect")
    73  		return
    74  	}
    75  
    76  	if len(cmap.codespaces) != 1 {
    77  		t.Errorf("len codespace != 1 (%d)", len(cmap.codespaces))
    78  		return
    79  	}
    80  
    81  	if cmap.codespaces[0].low != 0 {
    82  		t.Errorf("code space low range != 0 (%d)", cmap.codespaces[0].low)
    83  		return
    84  	}
    85  
    86  	if cmap.codespaces[0].high != 0xFFFF {
    87  		t.Errorf("code space high range != 0xffff (%d)", cmap.codespaces[0].high)
    88  		return
    89  	}
    90  
    91  	expectedMappings := map[uint64]rune{
    92  		0x0003:     0x0020,
    93  		0x005F:     0x007C,
    94  		0x000F:     0x002C,
    95  		0x000F + 5: 0x002C + 5,
    96  		0x001B:     0x0038,
    97  		0x001B + 2: 0x0038 + 2,
    98  		0x002F:     0x004C,
    99  		0x0044:     0x0061,
   100  		0x004F:     0x006C,
   101  		0x0055:     0x0072,
   102  	}
   103  
   104  	for k, expected := range expectedMappings {
   105  		if v := cmap.CharcodeToUnicode(k); v != string(expected) {
   106  			t.Errorf("incorrect mapping, expecting 0x%X -> 0x%X (%#v)", k, expected, v)
   107  			return
   108  		}
   109  	}
   110  
   111  	v := cmap.CharcodeToUnicode(0x99)
   112  	if v != "?" { //!= "notdef" {
   113  		t.Errorf("Unmapped code, expected to map to undefined")
   114  		return
   115  	}
   116  
   117  	charcodes := []byte{0x00, 0x03, 0x00, 0x0F}
   118  	s := cmap.CharcodeBytesToUnicode(charcodes)
   119  	if s != " ," {
   120  		t.Error("Incorrect charcode bytes -> string mapping")
   121  		return
   122  	}
   123  }
   124  
   // cmap2Data is the CMap fixture loaded by TestCMapParser2. It declares a single codespace
   // range <0000>-<FFFF> and two bfrange entries, the first of which starts at <0080>, a
   // 2-byte code whose high byte is zero.
   // NOTE(review): `7 beginbfrange` declares 7 entries but only 2 follow; presumably the count
   // was carried over from cmap1Data. Confirm how the parser treats the count before changing it.
   125  const cmap2Data = `
   126  /CIDInit /ProcSet findresource begin
   127  12 dict begin
   128  begincmap
   129  /CIDSystemInfo
   130  <<  /Registry (Adobe)
   131  /Ordering (UCS)
   132  /Supplement 0
   133  >> def
   134  /CMapName /Adobe-Identity-UCS def
   135  /CMapType 2 def
   136  1 begincodespacerange
   137  <0000> <FFFF>
   138  endcodespacerange
   139  7 beginbfrange
   140  <0080> <00FF> <002C>
   141  <802F> <902F> <0038>
   142  endbfrange
   143  endcmap
   144  CMapName currentdict /CMap defineresource pop
   145  end
   146  end
   147  `
   148  
   149  // TestCMapParser2 tests a bug that came up when 2-byte character codes had the higher byte set to 0,
   150  // e.g. 0x0080, and the character map was not taking the number of bytes of the input codemap into account.
   151  func TestCMapParser2(t *testing.T) {
   152  	//common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
   153  
   154  	cmap, err := LoadCmapFromData([]byte(cmap2Data))
   155  	if err != nil {
   156  		t.Error("Failed: ", err)
   157  		return
   158  	}
   159  
   160  	if cmap.Name() != "Adobe-Identity-UCS" {
   161  		t.Errorf("CMap name incorrect (%s)", cmap.Name())
   162  		return
   163  	}
   164  
   165  	if cmap.Type() != 2 {
   166  		t.Errorf("CMap type incorrect")
   167  		return
   168  	}
   169  
   170  	if len(cmap.codespaces) != 1 {
   171  		t.Errorf("len codespace != 1 (%d)", len(cmap.codespaces))
   172  		return
   173  	}
   174  
   175  	if cmap.codespaces[0].low != 0 {
   176  		t.Errorf("code space low range != 0 (%d)", cmap.codespaces[0].low)
   177  		return
   178  	}
   179  
   180  	if cmap.codespaces[0].high != 0xFFFF {
   181  		t.Errorf("code space high range != 0xffff (%d)", cmap.codespaces[0].high)
   182  		return
   183  	}
   184  
   185  	expectedMappings := map[uint64]rune{
   186  		0x0080: 0x002C,
   187  		0x802F: 0x0038,
   188  	}
   189  
   190  	for k, expected := range expectedMappings {
   191  		if v := cmap.CharcodeToUnicode(k); v != string(expected) {
   192  			t.Errorf("incorrect mapping, expecting 0x%X -> 0x%X (got 0x%X)", k, expected, v)
   193  			return
   194  		}
   195  	}
   196  
   197  	// Check byte sequence mappings.
   198  	excpectedSequenceMappings := []struct {
   199  		bytes    []byte
   200  		expected string
   201  	}{
   202  		{[]byte{0x80, 0x2F, 0x00, 0x80}, string([]rune{0x0038, 0x002C})},
   203  	}
   204  
   205  	for _, exp := range excpectedSequenceMappings {
   206  		str := cmap.CharcodeBytesToUnicode(exp.bytes)
   207  		if str != exp.expected {
   208  			t.Errorf("Incorrect byte sequence mapping -> % X -> % X (got % X)", exp.bytes, []rune(exp.expected), []rune(str))
   209  			return
   210  		}
   211  	}
   212  }
   213  
   214  // cmapData3 is a CMap with a mixture of 1 and 2 byte codespaces; it is the fixture loaded by
        // TestCMapParser3. It declares four codespace ranges (two 1-byte, two 2-byte) and four
        // bfrange entries, and is named test-1 with CMapType 1.
        // NOTE(review): `7 beginbfrange` declares 7 entries but only 4 follow; confirm how the
        // parser treats the count before changing it.
   215  const cmapData3 = `
   216  /CIDInit /ProcSet findresource begin
   217  12 dict begin begincmap
   218  /CIDSystemInfo
   219  3 dict dup begin
   220  /Registry (Adobe) def
   221  /Supplement 2 def
   222  end def
   223  
   224  /CMapName /test-1 def
   225  /CMapType 1 def
   226  
   227  4 begincodespacerange
   228  <00> <80>
   229  <8100> <9fff>
   230  <a0> <df>
   231  <d040> <fbfc>
   232  endcodespacerange
   233  7 beginbfrange
   234  <00> <80> <10>
   235  <8100> <9f00> <1000>
   236  <a0> <d0> <90>
   237  <d140> <f000> <a000>
   238  endbfrange
   239  endcmap
   240  `
   241  
   242  // TestCMapParser3 test case of a CMap with mixed number of 1 and 2 bytes in the codespace range.
   243  func TestCMapParser3(t *testing.T) {
   244  	//common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
   245  
   246  	cmap, err := LoadCmapFromData([]byte(cmapData3))
   247  	if err != nil {
   248  		t.Error("Failed: ", err)
   249  		return
   250  	}
   251  
   252  	if cmap.Name() != "test-1" {
   253  		t.Errorf("CMap name incorrect (%s)", cmap.Name())
   254  		return
   255  	}
   256  
   257  	if cmap.Type() != 1 {
   258  		t.Errorf("CMap type incorrect")
   259  		return
   260  	}
   261  
   262  	// Check codespaces.
   263  	expectedCodespaces := []struct {
   264  		numBytes int
   265  		low      uint64
   266  		high     uint64
   267  	}{
   268  		{1, 0x00, 0x80},
   269  		{2, 0x8100, 0x9fff},
   270  		{1, 0xa0, 0xdf},
   271  		{2, 0xd040, 0xfbfc},
   272  	}
   273  
   274  	if len(cmap.codespaces) != len(expectedCodespaces) {
   275  		t.Errorf("len codespace != %d (%d)", len(expectedCodespaces), len(cmap.codespaces))
   276  		return
   277  	}
   278  
   279  	for i, cs := range cmap.codespaces {
   280  		exp := expectedCodespaces[i]
   281  		if cs.numBytes != exp.numBytes {
   282  			t.Errorf("code space number of bytes != %d (%d)", exp.numBytes, cs.numBytes)
   283  			return
   284  		}
   285  
   286  		if cs.low != exp.low {
   287  			t.Errorf("code space low range != %d (%d)", exp.low, cs.low)
   288  			return
   289  		}
   290  
   291  		if cs.high != exp.high {
   292  			t.Errorf("code space high range != 0x%X (0x%X)", exp.high, cs.high)
   293  			return
   294  		}
   295  	}
   296  
   297  	// Check mappings.
   298  	expectedMappings := map[uint64]rune{
   299  		0x0080: 0x10 + 0x80,
   300  		0x8100: 0x1000,
   301  		0x00a0: 0x90,
   302  		0xd140: 0xa000,
   303  	}
   304  	for k, expected := range expectedMappings {
   305  		if v := cmap.CharcodeToUnicode(k); v != string(expected) {
   306  			t.Errorf("incorrect mapping, expecting 0x%X -> 0x%X (got 0x%X)", k, expected, v)
   307  			return
   308  		}
   309  	}
   310  
   311  	// Check byte sequence mappings.
   312  	excpectedSequenceMappings := []struct {
   313  		bytes    []byte
   314  		expected string
   315  	}{
   316  		{[]byte{0x80, 0x81, 0x00, 0xa1, 0xd1, 0x80, 0x00}, string([]rune{0x90, 0x1000, 0x91, 0xa000 + 0x40, 0x10})},
   317  	}
   318  
   319  	for _, exp := range excpectedSequenceMappings {
   320  		str := cmap.CharcodeBytesToUnicode(exp.bytes)
   321  		if str != exp.expected {
   322  			t.Errorf("Incorrect byte sequence mapping -> % X -> % X (got % X)", exp.bytes, []rune(exp.expected), []rune(str))
   323  			return
   324  		}
   325  	}
   326  }