github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/internal/cmap/cmap.go (about) 1 /* 2 * This file is subject to the terms and conditions defined in 3 * file 'LICENSE.md', which is part of this source code package. 4 */ 5 6 package cmap 7 8 import ( 9 "bytes" 10 "errors" 11 "io" 12 13 "github.com/unidoc/unidoc/common" 14 "github.com/unidoc/unidoc/pdf/model/textencoding" 15 ) 16 17 // CMap represents a character code to unicode mapping used in PDF files. 18 type CMap struct { 19 *cMapParser 20 21 // Text encoder to look up runes from input glyph names. 22 encoder textencoding.TextEncoder 23 24 // map of character code to string (sequence of runes) for 1-4 byte codes separately. 25 codeMap [4]map[uint64]string 26 27 name string 28 ctype int 29 codespaces []codespace 30 } 31 32 // codespace represents a single codespace range used in the CMap. 33 type codespace struct { 34 numBytes int 35 low uint64 36 high uint64 37 } 38 39 // Name returns the name of the CMap. 40 func (cmap *CMap) Name() string { 41 return cmap.name 42 } 43 44 // Type returns the type of the CMap. 45 func (cmap *CMap) Type() int { 46 return cmap.ctype 47 } 48 49 // CharcodeBytesToUnicode converts a byte array of charcodes to a unicode string representation. 50 func (cmap *CMap) CharcodeBytesToUnicode(src []byte) string { 51 var buf bytes.Buffer 52 53 // Maximum number of possible bytes per code. 54 maxLen := 4 55 56 i := 0 57 for i < len(src) { 58 var code uint64 59 var j int 60 for j = 0; j < maxLen && i+j < len(src); j++ { 61 b := src[i+j] 62 63 code <<= 8 64 code |= uint64(b) 65 66 tgt, has := cmap.codeMap[j][code] 67 if has { 68 buf.WriteString(tgt) 69 break 70 } else if j == maxLen-1 || i+j == len(src)-1 { 71 break 72 } 73 } 74 i += j + 1 75 } 76 77 return buf.String() 78 } 79 80 // CharcodeToUnicode converts a single character code to unicode string. 81 // Note that CharcodeBytesToUnicode is typically more efficient. 82 func (cmap *CMap) CharcodeToUnicode(srcCode uint64) string { 83 // Search through different code lengths. 84 for numBytes := 1; numBytes <= 4; numBytes++ { 85 if c, has := cmap.codeMap[numBytes-1][srcCode]; has { 86 return c 87 } 88 } 89 90 // Not found. 91 return "?" 92 } 93 94 // newCMap returns an initialized CMap. 95 func newCMap() *CMap { 96 cmap := &CMap{} 97 cmap.codespaces = []codespace{} 98 cmap.codeMap = [4]map[uint64]string{} 99 // Maps for 1-4 bytes are initialized. Minimal overhead if not used (most commonly used are 1-2 bytes). 100 cmap.codeMap[0] = map[uint64]string{} 101 cmap.codeMap[1] = map[uint64]string{} 102 cmap.codeMap[2] = map[uint64]string{} 103 cmap.codeMap[3] = map[uint64]string{} 104 return cmap 105 } 106 107 // LoadCmapFromData parses CMap data in memory through a byte vector and returns a CMap which 108 // can be used for character code to unicode conversion. 109 func LoadCmapFromData(data []byte) (*CMap, error) { 110 cmap := newCMap() 111 cmap.cMapParser = newCMapParser(data) 112 113 err := cmap.parse() 114 if err != nil { 115 return cmap, err 116 } 117 118 return cmap, nil 119 } 120 121 // parse parses the CMap file and loads into the CMap structure. 122 func (cmap *CMap) parse() error { 123 for { 124 o, err := cmap.parseObject() 125 if err != nil { 126 if err == io.EOF { 127 break 128 } 129 130 common.Log.Debug("Error parsing CMap: %v", err) 131 return err 132 } 133 134 if op, isOp := o.(cmapOperand); isOp { 135 common.Log.Trace("Operand: %s", op.Operand) 136 137 if op.Operand == begincodespacerange { 138 err := cmap.parseCodespaceRange() 139 if err != nil { 140 return err 141 } 142 } else if op.Operand == beginbfchar { 143 err := cmap.parseBfchar() 144 if err != nil { 145 return err 146 } 147 } else if op.Operand == beginbfrange { 148 err := cmap.parseBfrange() 149 if err != nil { 150 return err 151 } 152 } 153 } else if n, isName := o.(cmapName); isName { 154 if n.Name == cmapname { 155 o, err := cmap.parseObject() 156 if err != nil { 157 if err == io.EOF { 158 break 159 } 160 return err 161 } 162 name, ok := o.(cmapName) 163 if !ok { 164 return errors.New("CMap name not a name") 165 } 166 cmap.name = name.Name 167 } else if n.Name == cmaptype { 168 o, err := cmap.parseObject() 169 if err != nil { 170 if err == io.EOF { 171 break 172 } 173 return err 174 } 175 typeInt, ok := o.(cmapInt) 176 if !ok { 177 return errors.New("CMap type not an integer") 178 } 179 cmap.ctype = int(typeInt.val) 180 } 181 } else { 182 common.Log.Trace("Unhandled object: %T %#v", o, o) 183 } 184 } 185 186 return nil 187 } 188 189 // parseCodespaceRange parses the codespace range section of a CMap. 190 func (cmap *CMap) parseCodespaceRange() error { 191 for { 192 o, err := cmap.parseObject() 193 if err != nil { 194 if err == io.EOF { 195 break 196 } 197 return err 198 } 199 200 hexLow, isHex := o.(cmapHexString) 201 if !isHex { 202 if op, isOperand := o.(cmapOperand); isOperand { 203 if op.Operand == endcodespacerange { 204 return nil 205 } 206 return errors.New("Unexpected operand") 207 } 208 } 209 210 o, err = cmap.parseObject() 211 if err != nil { 212 if err == io.EOF { 213 break 214 } 215 return err 216 } 217 hexHigh, ok := o.(cmapHexString) 218 if !ok { 219 return errors.New("Non-hex high") 220 } 221 222 if hexLow.numBytes != hexHigh.numBytes { 223 return errors.New("Unequal number of bytes in range") 224 } 225 226 low := hexToUint64(hexLow) 227 high := hexToUint64(hexHigh) 228 numBytes := hexLow.numBytes 229 230 cspace := codespace{numBytes: numBytes, low: low, high: high} 231 cmap.codespaces = append(cmap.codespaces, cspace) 232 233 common.Log.Trace("Codespace low: 0x%X, high: 0x%X", low, high) 234 } 235 236 return nil 237 } 238 239 // parseBfchar parses a bfchar section of a CMap file. 240 func (cmap *CMap) parseBfchar() error { 241 for { 242 // Src code. 243 o, err := cmap.parseObject() 244 if err != nil { 245 if err == io.EOF { 246 break 247 } 248 return err 249 } 250 var srcCode uint64 251 var numBytes int 252 253 switch v := o.(type) { 254 case cmapOperand: 255 if v.Operand == endbfchar { 256 return nil 257 } 258 return errors.New("Unexpected operand") 259 case cmapHexString: 260 srcCode = hexToUint64(v) 261 numBytes = v.numBytes 262 default: 263 return errors.New("Unexpected type") 264 } 265 266 // Target code. 267 o, err = cmap.parseObject() 268 if err != nil { 269 if err == io.EOF { 270 break 271 } 272 return err 273 } 274 var toCode string 275 276 switch v := o.(type) { 277 case cmapOperand: 278 if v.Operand == endbfchar { 279 return nil 280 } 281 return errors.New("Unexpected operand") 282 case cmapHexString: 283 toCode = hexToString(v) 284 case cmapName: 285 toCode = "?" 286 if cmap.encoder != nil { 287 if r, found := cmap.encoder.GlyphToRune(v.Name); found { 288 toCode = string(r) 289 } 290 } 291 default: 292 return errors.New("Unexpected type") 293 } 294 295 if numBytes <= 0 || numBytes > 4 { 296 return errors.New("Invalid code length") 297 } 298 299 cmap.codeMap[numBytes-1][srcCode] = toCode 300 } 301 302 return nil 303 } 304 305 // parseBfrange parses a bfrange section of a CMap file. 306 func (cmap *CMap) parseBfrange() error { 307 for { 308 // The specifications are in pairs of 3. 309 // <srcCodeFrom> <srcCodeTo> <target> 310 // where target can be either <destFrom> as a hex code, or a list. 311 312 // Src code from. 313 var srcCodeFrom uint64 314 var numBytes int 315 { 316 o, err := cmap.parseObject() 317 if err != nil { 318 if err == io.EOF { 319 break 320 } 321 return err 322 } 323 324 switch v := o.(type) { 325 case cmapOperand: 326 if v.Operand == endbfrange { 327 return nil 328 } 329 return errors.New("Unexpected operand") 330 case cmapHexString: 331 srcCodeFrom = hexToUint64(v) 332 numBytes = v.numBytes 333 default: 334 return errors.New("Unexpected type") 335 } 336 } 337 338 // Src code to. 339 var srcCodeTo uint64 340 { 341 o, err := cmap.parseObject() 342 if err != nil { 343 if err == io.EOF { 344 break 345 } 346 return err 347 } 348 349 switch v := o.(type) { 350 case cmapOperand: 351 if v.Operand == endbfrange { 352 return nil 353 } 354 return errors.New("Unexpected operand") 355 case cmapHexString: 356 srcCodeTo = hexToUint64(v) 357 default: 358 return errors.New("Unexpected type") 359 } 360 } 361 362 // target(s). 363 o, err := cmap.parseObject() 364 if err != nil { 365 if err == io.EOF { 366 break 367 } 368 return err 369 } 370 371 if numBytes <= 0 || numBytes > 4 { 372 return errors.New("Invalid code length") 373 } 374 375 switch v := o.(type) { 376 case cmapArray: 377 sc := srcCodeFrom 378 for _, o := range v.Array { 379 hexs, ok := o.(cmapHexString) 380 if !ok { 381 return errors.New("Non-hex string in array") 382 } 383 cmap.codeMap[numBytes-1][sc] = hexToString(hexs) 384 sc++ 385 } 386 if sc != srcCodeTo+1 { 387 return errors.New("Invalid number of items in array") 388 } 389 case cmapHexString: 390 // <srcCodeFrom> <srcCodeTo> <dstCode>, maps [from,to] to [dstCode,dstCode+to-from]. 391 // in hex format. 392 target := hexToUint64(v) 393 i := uint64(0) 394 for sc := srcCodeFrom; sc <= srcCodeTo; sc++ { 395 r := target + i 396 cmap.codeMap[numBytes-1][sc] = string(r) 397 i++ 398 } 399 default: 400 return errors.New("Unexpected type") 401 } 402 } 403 404 return nil 405 }