github.com/unidoc/unidoc@v2.2.0+incompatible/pdf/contentstream/parser.go (about) 1 /* 2 * This file is subject to the terms and conditions defined in 3 * file 'LICENSE.md', which is part of this source code package. 4 */ 5 6 package contentstream 7 8 import ( 9 "bufio" 10 "bytes" 11 "encoding/hex" 12 "errors" 13 "fmt" 14 "io" 15 "strconv" 16 17 "github.com/unidoc/unidoc/common" 18 . "github.com/unidoc/unidoc/pdf/core" 19 ) 20 21 // Content stream parser. 22 type ContentStreamParser struct { 23 reader *bufio.Reader 24 } 25 26 // Create a new instance of the content stream parser from an input content 27 // stream string. 28 func NewContentStreamParser(contentStr string) *ContentStreamParser { 29 // Each command has parameters and an operand (command). 30 parser := ContentStreamParser{} 31 32 buffer := bytes.NewBufferString(contentStr + "\n") // Add newline at end to get last operand without EOF error. 33 parser.reader = bufio.NewReader(buffer) 34 35 return &parser 36 } 37 38 // Parses all commands in content stream, returning a list of operation data. 39 func (this *ContentStreamParser) Parse() (*ContentStreamOperations, error) { 40 operations := ContentStreamOperations{} 41 42 for { 43 operation := ContentStreamOperation{} 44 45 for { 46 obj, err, isOperand := this.parseObject() 47 if err != nil { 48 if err == io.EOF { 49 // End of data. Successful exit point. 50 return &operations, nil 51 } 52 return &operations, err 53 } 54 if isOperand { 55 operation.Operand = string(*obj.(*PdfObjectString)) 56 operations = append(operations, &operation) 57 break 58 } else { 59 operation.Params = append(operation.Params, obj) 60 } 61 } 62 63 if operation.Operand == "BI" { 64 // Parse an inline image, reads everything between the "BI" and "EI". 65 // The image is stored as the parameter. 66 im, err := this.ParseInlineImage() 67 if err != nil { 68 return &operations, err 69 } 70 operation.Params = append(operation.Params, im) 71 } 72 } 73 } 74 75 // Skip over any spaces. Returns the number of spaces skipped and 76 // an error if any. 77 func (this *ContentStreamParser) skipSpaces() (int, error) { 78 cnt := 0 79 for { 80 bb, err := this.reader.Peek(1) 81 if err != nil { 82 return 0, err 83 } 84 if IsWhiteSpace(bb[0]) { 85 this.reader.ReadByte() 86 cnt++ 87 } else { 88 break 89 } 90 } 91 92 return cnt, nil 93 } 94 95 // Skip over comments and spaces. Can handle multi-line comments. 96 func (this *ContentStreamParser) skipComments() error { 97 if _, err := this.skipSpaces(); err != nil { 98 return err 99 } 100 101 isFirst := true 102 for { 103 bb, err := this.reader.Peek(1) 104 if err != nil { 105 common.Log.Debug("Error %s", err.Error()) 106 return err 107 } 108 if isFirst && bb[0] != '%' { 109 // Not a comment clearly. 110 return nil 111 } else { 112 isFirst = false 113 } 114 if (bb[0] != '\r') && (bb[0] != '\n') { 115 this.reader.ReadByte() 116 } else { 117 break 118 } 119 } 120 121 // Call recursively to handle multiline comments. 122 return this.skipComments() 123 } 124 125 // Parse a name starting with '/'. 126 func (this *ContentStreamParser) parseName() (PdfObjectName, error) { 127 name := "" 128 nameStarted := false 129 for { 130 bb, err := this.reader.Peek(1) 131 if err == io.EOF { 132 break // Can happen when loading from object stream. 133 } 134 if err != nil { 135 return PdfObjectName(name), err 136 } 137 138 if !nameStarted { 139 // Should always start with '/', otherwise not valid. 140 if bb[0] == '/' { 141 nameStarted = true 142 this.reader.ReadByte() 143 } else { 144 common.Log.Error("Name starting with %s (% x)", bb, bb) 145 return PdfObjectName(name), fmt.Errorf("Invalid name: (%c)", bb[0]) 146 } 147 } else { 148 if IsWhiteSpace(bb[0]) { 149 break 150 } else if (bb[0] == '/') || (bb[0] == '[') || (bb[0] == '(') || (bb[0] == ']') || (bb[0] == '<') || (bb[0] == '>') { 151 break // Looks like start of next statement. 152 } else if bb[0] == '#' { 153 hexcode, err := this.reader.Peek(3) 154 if err != nil { 155 return PdfObjectName(name), err 156 } 157 this.reader.Discard(3) 158 159 code, err := hex.DecodeString(string(hexcode[1:3])) 160 if err != nil { 161 return PdfObjectName(name), err 162 } 163 name += string(code) 164 } else { 165 b, _ := this.reader.ReadByte() 166 name += string(b) 167 } 168 } 169 } 170 return PdfObjectName(name), nil 171 } 172 173 // Numeric objects. 174 // Section 7.3.3. 175 // Integer or Float. 176 // 177 // An integer shall be written as one or more decimal digits optionally 178 // preceded by a sign. The value shall be interpreted as a signed 179 // decimal integer and shall be converted to an integer object. 180 // 181 // A real value shall be written as one or more decimal digits with an 182 // optional sign and a leading, trailing, or embedded PERIOD (2Eh) 183 // (decimal point). The value shall be interpreted as a real number 184 // and shall be converted to a real object. 185 // 186 // Regarding exponential numbers: 7.3.3 Numeric Objects: 187 // A conforming writer shall not use the PostScript syntax for numbers 188 // with non-decimal radices (such as 16#FFFE) or in exponential format 189 // (such as 6.02E23). 190 // Nonetheless, we sometimes get numbers with exponential format, so 191 // we will support it in the reader (no confusion with other types, so 192 // no compromise). 193 func (this *ContentStreamParser) parseNumber() (PdfObject, error) { 194 isFloat := false 195 allowSigns := true 196 numStr := "" 197 for { 198 common.Log.Trace("Parsing number \"%s\"", numStr) 199 bb, err := this.reader.Peek(1) 200 if err == io.EOF { 201 // GH: EOF handling. Handle EOF like end of line. Can happen with 202 // encoded object streams that the object is at the end. 203 // In other cases, we will get the EOF error elsewhere at any rate. 204 break // Handle like EOF 205 } 206 if err != nil { 207 common.Log.Error("ERROR %s", err) 208 return nil, err 209 } 210 if allowSigns && (bb[0] == '-' || bb[0] == '+') { 211 // Only appear in the beginning, otherwise serves as a delimiter. 212 b, _ := this.reader.ReadByte() 213 numStr += string(b) 214 allowSigns = false // Only allowed in beginning, and after e (exponential). 215 } else if IsDecimalDigit(bb[0]) { 216 b, _ := this.reader.ReadByte() 217 numStr += string(b) 218 } else if bb[0] == '.' { 219 b, _ := this.reader.ReadByte() 220 numStr += string(b) 221 isFloat = true 222 } else if bb[0] == 'e' { 223 // Exponential number format. 224 b, _ := this.reader.ReadByte() 225 numStr += string(b) 226 isFloat = true 227 allowSigns = true 228 } else { 229 break 230 } 231 } 232 233 if isFloat { 234 fVal, err := strconv.ParseFloat(numStr, 64) 235 if err != nil { 236 common.Log.Debug("Error parsing number %q err=%v. Using 0.0. Output may be incorrect", numStr, err) 237 fVal = 0.0 238 err = nil 239 } 240 o := PdfObjectFloat(fVal) 241 return &o, err 242 } else { 243 intVal, err := strconv.ParseInt(numStr, 10, 64) 244 if err != nil { 245 common.Log.Debug("Error parsing integer %q err=%v. Using 0. Output may be incorrect", numStr, err) 246 intVal = 0 247 err = nil 248 } 249 o := PdfObjectInteger(intVal) 250 return &o, err 251 } 252 } 253 254 // A string starts with '(' and ends with ')'. 255 func (this *ContentStreamParser) parseString() (PdfObjectString, error) { 256 this.reader.ReadByte() 257 258 bytes := []byte{} 259 count := 1 260 for { 261 bb, err := this.reader.Peek(1) 262 if err != nil { 263 return PdfObjectString(bytes), err 264 } 265 266 if bb[0] == '\\' { // Escape sequence. 267 this.reader.ReadByte() // Skip the escape \ byte. 268 b, err := this.reader.ReadByte() 269 if err != nil { 270 return PdfObjectString(bytes), err 271 } 272 273 // Octal '\ddd' number (base 8). 274 if IsOctalDigit(b) { 275 bb, err := this.reader.Peek(2) 276 if err != nil { 277 return PdfObjectString(bytes), err 278 } 279 280 numeric := []byte{} 281 numeric = append(numeric, b) 282 for _, val := range bb { 283 if IsOctalDigit(val) { 284 numeric = append(numeric, val) 285 } else { 286 break 287 } 288 } 289 this.reader.Discard(len(numeric) - 1) 290 291 common.Log.Trace("Numeric string \"%s\"", numeric) 292 code, err := strconv.ParseUint(string(numeric), 8, 32) 293 if err != nil { 294 return PdfObjectString(bytes), err 295 } 296 bytes = append(bytes, byte(code)) 297 continue 298 } 299 300 switch b { 301 case 'n': 302 bytes = append(bytes, '\n') 303 case 'r': 304 bytes = append(bytes, '\r') 305 case 't': 306 bytes = append(bytes, '\t') 307 case 'b': 308 bytes = append(bytes, '\b') 309 case 'f': 310 bytes = append(bytes, '\f') 311 case '(': 312 bytes = append(bytes, '(') 313 case ')': 314 bytes = append(bytes, ')') 315 case '\\': 316 bytes = append(bytes, '\\') 317 } 318 319 continue 320 } else if bb[0] == '(' { 321 count++ 322 } else if bb[0] == ')' { 323 count-- 324 if count == 0 { 325 this.reader.ReadByte() 326 break 327 } 328 } 329 330 b, _ := this.reader.ReadByte() 331 bytes = append(bytes, b) 332 } 333 334 return PdfObjectString(bytes), nil 335 } 336 337 // Starts with '<' ends with '>'. 338 func (this *ContentStreamParser) parseHexString() (PdfObjectString, error) { 339 this.reader.ReadByte() 340 341 hextable := []byte("0123456789abcdefABCDEF") 342 343 tmp := []byte{} 344 for { 345 this.skipSpaces() 346 347 bb, err := this.reader.Peek(1) 348 if err != nil { 349 return PdfObjectString(""), err 350 } 351 352 if bb[0] == '>' { 353 this.reader.ReadByte() 354 break 355 } 356 357 b, _ := this.reader.ReadByte() 358 if bytes.IndexByte(hextable, b) >= 0 { 359 tmp = append(tmp, b) 360 } 361 } 362 363 if len(tmp)%2 == 1 { 364 tmp = append(tmp, '0') 365 } 366 367 buf, _ := hex.DecodeString(string(tmp)) 368 return PdfObjectString(buf), nil 369 } 370 371 // Starts with '[' ends with ']'. Can contain any kinds of direct objects. 372 func (this *ContentStreamParser) parseArray() (PdfObjectArray, error) { 373 arr := make(PdfObjectArray, 0) 374 375 this.reader.ReadByte() 376 377 for { 378 this.skipSpaces() 379 380 bb, err := this.reader.Peek(1) 381 if err != nil { 382 return arr, err 383 } 384 385 if bb[0] == ']' { 386 this.reader.ReadByte() 387 break 388 } 389 390 obj, err, _ := this.parseObject() 391 if err != nil { 392 return arr, err 393 } 394 arr = append(arr, obj) 395 } 396 397 return arr, nil 398 } 399 400 // Parse bool object. 401 func (this *ContentStreamParser) parseBool() (PdfObjectBool, error) { 402 bb, err := this.reader.Peek(4) 403 if err != nil { 404 return PdfObjectBool(false), err 405 } 406 if (len(bb) >= 4) && (string(bb[:4]) == "true") { 407 this.reader.Discard(4) 408 return PdfObjectBool(true), nil 409 } 410 411 bb, err = this.reader.Peek(5) 412 if err != nil { 413 return PdfObjectBool(false), err 414 } 415 if (len(bb) >= 5) && (string(bb[:5]) == "false") { 416 this.reader.Discard(5) 417 return PdfObjectBool(false), nil 418 } 419 420 return PdfObjectBool(false), errors.New("Unexpected boolean string") 421 } 422 423 // Parse null object. 424 func (this *ContentStreamParser) parseNull() (PdfObjectNull, error) { 425 _, err := this.reader.Discard(4) 426 return PdfObjectNull{}, err 427 } 428 429 func (this *ContentStreamParser) parseDict() (*PdfObjectDictionary, error) { 430 common.Log.Trace("Reading content stream dict!") 431 432 dict := MakeDict() 433 434 // Pass the '<<' 435 c, _ := this.reader.ReadByte() 436 if c != '<' { 437 return nil, errors.New("Invalid dict") 438 } 439 c, _ = this.reader.ReadByte() 440 if c != '<' { 441 return nil, errors.New("Invalid dict") 442 } 443 444 for { 445 this.skipSpaces() 446 447 bb, err := this.reader.Peek(2) 448 if err != nil { 449 return nil, err 450 } 451 452 common.Log.Trace("Dict peek: %s (% x)!", string(bb), string(bb)) 453 if (bb[0] == '>') && (bb[1] == '>') { 454 common.Log.Trace("EOF dictionary") 455 this.reader.ReadByte() 456 this.reader.ReadByte() 457 break 458 } 459 common.Log.Trace("Parse the name!") 460 461 keyName, err := this.parseName() 462 common.Log.Trace("Key: %s", keyName) 463 if err != nil { 464 common.Log.Debug("ERROR Returning name err %s", err) 465 return nil, err 466 } 467 468 if len(keyName) > 4 && keyName[len(keyName)-4:] == "null" { 469 // Some writers have a bug where the null is appended without 470 // space. For example "\Boundsnull" 471 newKey := keyName[0 : len(keyName)-4] 472 common.Log.Trace("Taking care of null bug (%s)", keyName) 473 common.Log.Trace("New key \"%s\" = null", newKey) 474 this.skipSpaces() 475 bb, _ := this.reader.Peek(1) 476 if bb[0] == '/' { 477 dict.Set(newKey, MakeNull()) 478 continue 479 } 480 } 481 482 this.skipSpaces() 483 484 val, err, _ := this.parseObject() 485 if err != nil { 486 return nil, err 487 } 488 dict.Set(keyName, val) 489 490 common.Log.Trace("dict[%s] = %s", keyName, val.String()) 491 } 492 493 return dict, nil 494 } 495 496 // An operand is a text command represented by a word. 497 func (this *ContentStreamParser) parseOperand() (PdfObjectString, error) { 498 bytes := []byte{} 499 for { 500 bb, err := this.reader.Peek(1) 501 if err != nil { 502 return PdfObjectString(bytes), err 503 } 504 if IsDelimiter(bb[0]) { 505 break 506 } 507 if IsWhiteSpace(bb[0]) { 508 break 509 } 510 511 b, _ := this.reader.ReadByte() 512 bytes = append(bytes, b) 513 } 514 515 return PdfObjectString(bytes), nil 516 } 517 518 // Parse a generic object. Returns the object, an error code, and a bool 519 // value indicating whether the object is an operand. An operand 520 // is contained in a pdf string object. 521 func (this *ContentStreamParser) parseObject() (PdfObject, error, bool) { 522 // Determine the kind of object. 523 // parse it! 524 // make a list of operands, then once operand arrives put into a package. 525 526 this.skipSpaces() 527 for { 528 bb, err := this.reader.Peek(2) 529 if err != nil { 530 return nil, err, false 531 } 532 533 common.Log.Trace("Peek string: %s", string(bb)) 534 // Determine type. 535 if bb[0] == '%' { 536 this.skipComments() 537 continue 538 } else if bb[0] == '/' { 539 name, err := this.parseName() 540 common.Log.Trace("->Name: '%s'", name) 541 return &name, err, false 542 } else if bb[0] == '(' { 543 common.Log.Trace("->String!") 544 str, err := this.parseString() 545 common.Log.Trace("(%s)\n", str.String()) 546 return &str, err, false 547 } else if bb[0] == '<' && bb[1] != '<' { 548 common.Log.Trace("->Hex String!") 549 str, err := this.parseHexString() 550 return &str, err, false 551 } else if bb[0] == '[' { 552 common.Log.Trace("->Array!") 553 arr, err := this.parseArray() 554 return &arr, err, false 555 } else if IsFloatDigit(bb[0]) || (bb[0] == '-' && IsFloatDigit(bb[1])) { 556 common.Log.Trace("->Number!") 557 number, err := this.parseNumber() 558 return number, err, false 559 } else if bb[0] == '<' && bb[1] == '<' { 560 dict, err := this.parseDict() 561 return dict, err, false 562 } else { 563 // Otherwise, can be: keyword such as "null", "false", "true" or an operand... 564 common.Log.Trace("->Operand or bool?") 565 // Let's peek farther to find out. 566 bb, _ = this.reader.Peek(5) 567 peekStr := string(bb) 568 common.Log.Trace("cont Peek str: %s", peekStr) 569 570 if (len(peekStr) > 3) && (peekStr[:4] == "null") { 571 null, err := this.parseNull() 572 return &null, err, false 573 } else if (len(peekStr) > 4) && (peekStr[:5] == "false") { 574 b, err := this.parseBool() 575 return &b, err, false 576 } else if (len(peekStr) > 3) && (peekStr[:4] == "true") { 577 b, err := this.parseBool() 578 return &b, err, false 579 } 580 581 operand, err := this.parseOperand() 582 if err != nil { 583 return &operand, err, false 584 } 585 if len(operand.String()) < 1 { 586 return &operand, ErrInvalidOperand, false 587 } 588 return &operand, nil, true 589 } 590 } 591 }