github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/internal/cmap/parser.go (about) 1 /* 2 * This file is subject to the terms and conditions defined in 3 * file 'LICENSE.md', which is part of this source code package. 4 */ 5 6 package cmap 7 8 import ( 9 "bufio" 10 "bytes" 11 "errors" 12 "fmt" 13 "io" 14 "strconv" 15 16 "encoding/hex" 17 18 "github.com/unidoc/unidoc/common" 19 "github.com/unidoc/unidoc/pdf/core" 20 ) 21 22 // cMapParser parses CMap character to unicode mapping files. 23 type cMapParser struct { 24 reader *bufio.Reader 25 } 26 27 // cMapParser creates a new instance of the PDF CMap parser from input data. 28 func newCMapParser(content []byte) *cMapParser { 29 parser := cMapParser{} 30 31 buffer := bytes.NewBuffer(content) 32 parser.reader = bufio.NewReader(buffer) 33 34 return &parser 35 } 36 37 // Detect the signature at the current file position and parse 38 // the corresponding object. 39 func (p *cMapParser) parseObject() (cmapObject, error) { 40 p.skipSpaces() 41 for { 42 bb, err := p.reader.Peek(2) 43 if err != nil { 44 return nil, err 45 } 46 47 if bb[0] == '%' { 48 p.parseComment() 49 p.skipSpaces() 50 continue 51 } else if bb[0] == '/' { 52 name, err := p.parseName() 53 return name, err 54 } else if bb[0] == '(' { 55 str, err := p.parseString() 56 return str, err 57 } else if bb[0] == '[' { 58 arr, err := p.parseArray() 59 return arr, err 60 } else if (bb[0] == '<') && (bb[1] == '<') { 61 dict, err := p.parseDict() 62 return dict, err 63 } else if bb[0] == '<' { 64 shex, err := p.parseHexString() 65 return shex, err 66 } else if core.IsDecimalDigit(bb[0]) || (bb[0] == '-' && core.IsDecimalDigit(bb[1])) { 67 number, err := p.parseNumber() 68 if err != nil { 69 return nil, err 70 } 71 return number, nil 72 } else { 73 // Operand? 74 operand, err := p.parseOperand() 75 if err != nil { 76 return nil, err 77 } 78 79 return operand, nil 80 } 81 } 82 } 83 84 // Skip over any spaces. Returns the number of spaces skipped and 85 // an error if any. 86 func (p *cMapParser) skipSpaces() (int, error) { 87 cnt := 0 88 for { 89 bb, err := p.reader.Peek(1) 90 if err != nil { 91 return 0, err 92 } 93 if core.IsWhiteSpace(bb[0]) { 94 p.reader.ReadByte() 95 cnt++ 96 } else { 97 break 98 } 99 } 100 101 return cnt, nil 102 } 103 104 // parseComment reads a comment line starting with '%'. 105 func (p *cMapParser) parseComment() (string, error) { 106 var r bytes.Buffer 107 108 _, err := p.skipSpaces() 109 if err != nil { 110 return r.String(), err 111 } 112 113 isFirst := true 114 for { 115 bb, err := p.reader.Peek(1) 116 if err != nil { 117 common.Log.Debug("Error %s", err.Error()) 118 return r.String(), err 119 } 120 if isFirst && bb[0] != '%' { 121 return r.String(), errors.New("Comment should start with %") 122 } 123 isFirst = false 124 if (bb[0] != '\r') && (bb[0] != '\n') { 125 b, _ := p.reader.ReadByte() 126 r.WriteByte(b) 127 } else { 128 break 129 } 130 } 131 return r.String(), nil 132 } 133 134 // Parse a name starting with '/'. 135 func (p *cMapParser) parseName() (cmapName, error) { 136 name := "" 137 nameStarted := false 138 for { 139 bb, err := p.reader.Peek(1) 140 if err == io.EOF { 141 break // Can happen when loading from object stream. 142 } 143 if err != nil { 144 return cmapName{name}, err 145 } 146 147 if !nameStarted { 148 // Should always start with '/', otherwise not valid. 149 if bb[0] == '/' { 150 nameStarted = true 151 p.reader.ReadByte() 152 } else { 153 common.Log.Debug("ERROR Name starting with %s (% x)", bb, bb) 154 return cmapName{name}, fmt.Errorf("Invalid name: (%c)", bb[0]) 155 } 156 } else { 157 if core.IsWhiteSpace(bb[0]) { 158 break 159 } else if (bb[0] == '/') || (bb[0] == '[') || (bb[0] == '(') || (bb[0] == ']') || (bb[0] == '<') || (bb[0] == '>') { 160 break // Looks like start of next statement. 161 } else if bb[0] == '#' { 162 hexcode, err := p.reader.Peek(3) 163 if err != nil { 164 return cmapName{name}, err 165 } 166 p.reader.Discard(3) 167 168 code, err := hex.DecodeString(string(hexcode[1:3])) 169 if err != nil { 170 return cmapName{name}, err 171 } 172 name += string(code) 173 } else { 174 b, _ := p.reader.ReadByte() 175 name += string(b) 176 } 177 } 178 } 179 180 return cmapName{name}, nil 181 } 182 183 // A string starts with '(' and ends with ')'. 184 func (p *cMapParser) parseString() (cmapString, error) { 185 p.reader.ReadByte() 186 187 buf := bytes.Buffer{} 188 189 count := 1 190 for { 191 bb, err := p.reader.Peek(1) 192 if err != nil { 193 return cmapString{buf.String()}, err 194 } 195 196 if bb[0] == '\\' { // Escape sequence. 197 p.reader.ReadByte() // Skip the escape \ byte. 198 b, err := p.reader.ReadByte() 199 if err != nil { 200 return cmapString{buf.String()}, err 201 } 202 203 // Octal '\ddd' number (base 8). 204 if core.IsOctalDigit(b) { 205 bb, err := p.reader.Peek(2) 206 if err != nil { 207 return cmapString{buf.String()}, err 208 } 209 210 numeric := []byte{} 211 numeric = append(numeric, b) 212 for _, val := range bb { 213 if core.IsOctalDigit(val) { 214 numeric = append(numeric, val) 215 } else { 216 break 217 } 218 } 219 p.reader.Discard(len(numeric) - 1) 220 221 common.Log.Trace("Numeric string \"%s\"", numeric) 222 code, err := strconv.ParseUint(string(numeric), 8, 32) 223 if err != nil { 224 return cmapString{buf.String()}, err 225 } 226 buf.WriteByte(byte(code)) 227 continue 228 } 229 230 switch b { 231 case 'n': 232 buf.WriteByte('\n') 233 case 'r': 234 buf.WriteByte('\r') 235 case 't': 236 buf.WriteByte('\t') 237 case 'b': 238 buf.WriteByte('\b') 239 case 'f': 240 buf.WriteByte('\f') 241 case '(': 242 buf.WriteByte('(') 243 case ')': 244 buf.WriteByte(')') 245 case '\\': 246 buf.WriteByte('\\') 247 } 248 249 continue 250 } else if bb[0] == '(' { 251 count++ 252 } else if bb[0] == ')' { 253 count-- 254 if count == 0 { 255 p.reader.ReadByte() 256 break 257 } 258 } 259 260 b, _ := p.reader.ReadByte() 261 buf.WriteByte(b) 262 } 263 264 return cmapString{buf.String()}, nil 265 } 266 267 // Starts with '<' ends with '>'. 268 // Currently not converting the hex codes to characters. 269 func (p *cMapParser) parseHexString() (cmapHexString, error) { 270 p.reader.ReadByte() 271 272 hextable := []byte("0123456789abcdefABCDEF") 273 274 buf := bytes.Buffer{} 275 276 //tmp := []byte{} 277 for { 278 p.skipSpaces() 279 280 bb, err := p.reader.Peek(1) 281 if err != nil { 282 return cmapHexString{numBytes: 0, b: []byte("")}, err 283 } 284 285 if bb[0] == '>' { 286 p.reader.ReadByte() 287 break 288 } 289 290 b, _ := p.reader.ReadByte() 291 if bytes.IndexByte(hextable, b) >= 0 { 292 buf.WriteByte(b) 293 } 294 } 295 296 if buf.Len()%2 == 1 { 297 buf.WriteByte('0') 298 } 299 numBytes := buf.Len() / 2 300 301 hexb, _ := hex.DecodeString(buf.String()) 302 return cmapHexString{numBytes: numBytes, b: hexb}, nil 303 } 304 305 // Starts with '[' ends with ']'. Can contain any kinds of direct objects. 306 func (p *cMapParser) parseArray() (cmapArray, error) { 307 arr := cmapArray{} 308 arr.Array = []cmapObject{} 309 310 p.reader.ReadByte() 311 312 for { 313 p.skipSpaces() 314 315 bb, err := p.reader.Peek(1) 316 if err != nil { 317 return arr, err 318 } 319 320 if bb[0] == ']' { 321 p.reader.ReadByte() 322 break 323 } 324 325 obj, err := p.parseObject() 326 if err != nil { 327 return arr, err 328 } 329 arr.Array = append(arr.Array, obj) 330 } 331 332 return arr, nil 333 } 334 335 // Reads and parses a PDF dictionary object enclosed with '<<' and '>>' 336 func (p *cMapParser) parseDict() (cmapDict, error) { 337 common.Log.Trace("Reading PDF Dict!") 338 339 dict := makeDict() 340 341 // Pass the '<<' 342 c, _ := p.reader.ReadByte() 343 if c != '<' { 344 return dict, errors.New("Invalid dict") 345 } 346 c, _ = p.reader.ReadByte() 347 if c != '<' { 348 return dict, errors.New("Invalid dict") 349 } 350 351 for { 352 p.skipSpaces() 353 354 bb, err := p.reader.Peek(2) 355 if err != nil { 356 return dict, err 357 } 358 359 if (bb[0] == '>') && (bb[1] == '>') { 360 p.reader.ReadByte() 361 p.reader.ReadByte() 362 break 363 } 364 365 key, err := p.parseName() 366 common.Log.Trace("Key: %s", key.Name) 367 if err != nil { 368 common.Log.Debug("ERROR Returning name err %s", err) 369 return dict, err 370 } 371 372 p.skipSpaces() 373 374 val, err := p.parseObject() 375 if err != nil { 376 return dict, err 377 } 378 dict.Dict[key.Name] = val 379 380 // Skip "def" which optionally follows key value dict definitions in CMaps. 381 p.skipSpaces() 382 bb, err = p.reader.Peek(3) 383 if err != nil { 384 return dict, err 385 } 386 if string(bb) == "def" { 387 p.reader.Discard(3) 388 } 389 390 } 391 392 return dict, nil 393 } 394 395 func (p *cMapParser) parseNumber() (cmapObject, error) { 396 isFloat := false 397 allowSigns := true 398 399 numStr := bytes.Buffer{} 400 for { 401 bb, err := p.reader.Peek(1) 402 if err == io.EOF { 403 break 404 } 405 if err != nil { 406 return nil, err 407 } 408 if allowSigns && (bb[0] == '-' || bb[0] == '+') { 409 // Only appear in the beginning, otherwise serves as a delimiter. 410 b, _ := p.reader.ReadByte() 411 numStr.WriteByte(b) 412 allowSigns = false // Only allowed in beginning, and after e (exponential). 413 } else if core.IsDecimalDigit(bb[0]) { 414 b, _ := p.reader.ReadByte() 415 numStr.WriteByte(b) 416 } else if bb[0] == '.' { 417 b, _ := p.reader.ReadByte() 418 numStr.WriteByte(b) 419 isFloat = true 420 } else if bb[0] == 'e' { 421 // Exponential number format. 422 b, _ := p.reader.ReadByte() 423 numStr.WriteByte(b) 424 isFloat = true 425 allowSigns = true 426 } else { 427 break 428 } 429 } 430 431 if isFloat { 432 fVal, err := strconv.ParseFloat(numStr.String(), 64) 433 o := cmapFloat{fVal} 434 return o, err 435 } 436 intVal, err := strconv.ParseInt(numStr.String(), 10, 64) 437 o := cmapInt{intVal} 438 return o, err 439 } 440 441 // An operand is a text command represented by a word. 442 func (p *cMapParser) parseOperand() (cmapOperand, error) { 443 op := cmapOperand{} 444 445 buf := bytes.Buffer{} 446 for { 447 bb, err := p.reader.Peek(1) 448 if err != nil { 449 if err == io.EOF { 450 break 451 } 452 return op, err 453 } 454 if core.IsDelimiter(bb[0]) { 455 break 456 } 457 if core.IsWhiteSpace(bb[0]) { 458 break 459 } 460 461 b, _ := p.reader.ReadByte() 462 buf.WriteByte(b) 463 } 464 465 if buf.Len() == 0 { 466 return op, fmt.Errorf("Invalid operand (empty)") 467 } 468 469 op.Operand = buf.String() 470 471 return op, nil 472 }