github.com/TeaOSLab/EdgeNode@v1.3.8/internal/waf/injectionutils/libinjection/src/libinjection_html5.c (about) 1 #include "libinjection_html5.h" 2 3 #include <string.h> 4 #include <assert.h> 5 6 #ifdef DEBUG 7 #include <stdio.h> 8 #define TRACE() printf("%s:%d\n", __FUNCTION__, __LINE__) 9 #else 10 #define TRACE() 11 #endif 12 13 14 #define CHAR_EOF -1 15 #define CHAR_NULL 0 16 #define CHAR_BANG 33 17 #define CHAR_DOUBLE 34 18 #define CHAR_PERCENT 37 19 #define CHAR_SINGLE 39 20 #define CHAR_DASH 45 21 #define CHAR_SLASH 47 22 #define CHAR_LT 60 23 #define CHAR_EQUALS 61 24 #define CHAR_GT 62 25 #define CHAR_QUESTION 63 26 #define CHAR_RIGHTB 93 27 #define CHAR_TICK 96 28 29 /* prototypes */ 30 31 static int h5_skip_white(h5_state_t* hs); 32 static int h5_is_white(char ch); 33 static int h5_state_eof(h5_state_t* hs); 34 static int h5_state_data(h5_state_t* hs); 35 static int h5_state_tag_open(h5_state_t* hs); 36 static int h5_state_tag_name(h5_state_t* hs); 37 static int h5_state_tag_name_close(h5_state_t* hs); 38 static int h5_state_end_tag_open(h5_state_t* hs); 39 static int h5_state_self_closing_start_tag(h5_state_t* hs); 40 static int h5_state_attribute_name(h5_state_t* hs); 41 static int h5_state_after_attribute_name(h5_state_t* hs); 42 static int h5_state_before_attribute_name(h5_state_t* hs); 43 static int h5_state_before_attribute_value(h5_state_t* hs); 44 static int h5_state_attribute_value_double_quote(h5_state_t* hs); 45 static int h5_state_attribute_value_single_quote(h5_state_t* hs); 46 static int h5_state_attribute_value_back_quote(h5_state_t* hs); 47 static int h5_state_attribute_value_no_quote(h5_state_t* hs); 48 static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs); 49 static int h5_state_comment(h5_state_t* hs); 50 static int h5_state_cdata(h5_state_t* hs); 51 52 53 /* 12.2.4.44 */ 54 static int h5_state_bogus_comment(h5_state_t* hs); 55 static int h5_state_bogus_comment2(h5_state_t* hs); 56 57 /* 12.2.4.45 */ 58 static int h5_state_markup_declaration_open(h5_state_t* hs); 59 60 /* 8.2.4.52 */ 61 static int h5_state_doctype(h5_state_t* hs); 62 63 /** 64 * public function 65 */ 66 void libinjection_h5_init(h5_state_t* hs, const char* s, size_t len, enum html5_flags flags) 67 { 68 memset(hs, 0, sizeof(h5_state_t)); 69 hs->s = s; 70 hs->len = len; 71 72 switch (flags) { 73 case DATA_STATE: 74 hs->state = h5_state_data; 75 break; 76 case VALUE_NO_QUOTE: 77 hs->state = h5_state_before_attribute_name; 78 break; 79 case VALUE_SINGLE_QUOTE: 80 hs->state = h5_state_attribute_value_single_quote; 81 break; 82 case VALUE_DOUBLE_QUOTE: 83 hs->state = h5_state_attribute_value_double_quote; 84 break; 85 case VALUE_BACK_QUOTE: 86 hs->state = h5_state_attribute_value_back_quote; 87 break; 88 } 89 } 90 91 /** 92 * public function 93 */ 94 int libinjection_h5_next(h5_state_t* hs) 95 { 96 assert(hs->state != NULL); 97 return (*hs->state)(hs); 98 } 99 100 /** 101 * Everything below here is private 102 * 103 */ 104 105 106 static int h5_is_white(char ch) 107 { 108 /* 109 * \t = horizontal tab = 0x09 110 * \n = newline = 0x0A 111 * \v = vertical tab = 0x0B 112 * \f = form feed = 0x0C 113 * \r = cr = 0x0D 114 */ 115 return strchr(" \t\n\v\f\r", ch) != NULL; 116 } 117 118 static int h5_skip_white(h5_state_t* hs) 119 { 120 char ch; 121 while (hs->pos < hs->len) { 122 ch = hs->s[hs->pos]; 123 switch (ch) { 124 case 0x00: /* IE only */ 125 case 0x20: 126 case 0x09: 127 case 0x0A: 128 case 0x0B: /* IE only */ 129 case 0x0C: 130 case 0x0D: /* IE only */ 131 hs->pos += 1; 132 break; 133 default: 134 return ch; 135 } 136 } 137 return CHAR_EOF; 138 } 139 140 static int h5_state_eof(h5_state_t* hs) 141 { 142 /* eliminate unused function argument warning */ 143 (void)hs; 144 return 0; 145 } 146 147 static int h5_state_data(h5_state_t* hs) 148 { 149 const char* idx; 150 151 TRACE(); 152 assert(hs->len >= hs->pos); 153 idx = (const char*) memchr(hs->s + hs->pos, CHAR_LT, hs->len - hs->pos); 154 if (idx == NULL) { 155 hs->token_start = hs->s + hs->pos; 156 hs->token_len = hs->len - hs->pos; 157 hs->token_type = DATA_TEXT; 158 hs->state = h5_state_eof; 159 if (hs->token_len == 0) { 160 return 0; 161 } 162 } else { 163 hs->token_start = hs->s + hs->pos; 164 hs->token_type = DATA_TEXT; 165 hs->token_len = (size_t)(idx - hs->s) - hs->pos; 166 hs->pos = (size_t)(idx - hs->s) + 1; 167 hs->state = h5_state_tag_open; 168 if (hs->token_len == 0) { 169 return h5_state_tag_open(hs); 170 } 171 } 172 return 1; 173 } 174 175 /** 176 * 12 2.4.8 177 */ 178 static int h5_state_tag_open(h5_state_t* hs) 179 { 180 char ch; 181 182 TRACE(); 183 if (hs->pos >= hs->len) { 184 return 0; 185 } 186 ch = hs->s[hs->pos]; 187 if (ch == CHAR_BANG) { 188 hs->pos += 1; 189 return h5_state_markup_declaration_open(hs); 190 } else if (ch == CHAR_SLASH) { 191 hs->pos += 1; 192 hs->is_close = 1; 193 return h5_state_end_tag_open(hs); 194 } else if (ch == CHAR_QUESTION) { 195 hs->pos += 1; 196 return h5_state_bogus_comment(hs); 197 } else if (ch == CHAR_PERCENT) { 198 /* this is not in spec.. alternative comment format used 199 by IE <= 9 and Safari < 4.0.3 */ 200 hs->pos += 1; 201 return h5_state_bogus_comment2(hs); 202 } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) { 203 return h5_state_tag_name(hs); 204 } else if (ch == CHAR_NULL) { 205 /* IE-ism NULL characters are ignored */ 206 return h5_state_tag_name(hs); 207 } else { 208 /* user input mistake in configuring state */ 209 if (hs->pos == 0) { 210 return h5_state_data(hs); 211 } 212 hs->token_start = hs->s + hs->pos - 1; 213 hs->token_len = 1; 214 hs->token_type = DATA_TEXT; 215 hs->state = h5_state_data; 216 return 1; 217 } 218 } 219 /** 220 * 12.2.4.9 221 */ 222 static int h5_state_end_tag_open(h5_state_t* hs) 223 { 224 char ch; 225 226 TRACE(); 227 228 if (hs->pos >= hs->len) { 229 return 0; 230 } 231 ch = hs->s[hs->pos]; 232 if (ch == CHAR_GT) { 233 return h5_state_data(hs); 234 } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) { 235 return h5_state_tag_name(hs); 236 } 237 238 hs->is_close = 0; 239 return h5_state_bogus_comment(hs); 240 } 241 /* 242 * 243 */ 244 static int h5_state_tag_name_close(h5_state_t* hs) 245 { 246 TRACE(); 247 hs->is_close = 0; 248 hs->token_start = hs->s + hs->pos; 249 hs->token_len = 1; 250 hs->token_type = TAG_NAME_CLOSE; 251 hs->pos += 1; 252 if (hs->pos < hs->len) { 253 hs->state = h5_state_data; 254 } else { 255 hs->state = h5_state_eof; 256 } 257 258 return 1; 259 } 260 261 /** 262 * 12.2.4.10 263 */ 264 static int h5_state_tag_name(h5_state_t* hs) 265 { 266 char ch; 267 size_t pos; 268 269 TRACE(); 270 pos = hs->pos; 271 while (pos < hs->len) { 272 ch = hs->s[pos]; 273 if (ch == 0) { 274 /* special non-standard case */ 275 /* allow nulls in tag name */ 276 /* some old browsers apparently allow and ignore them */ 277 pos += 1; 278 } else if (h5_is_white(ch)) { 279 hs->token_start = hs->s + hs->pos; 280 hs->token_len = pos - hs->pos; 281 hs->token_type = TAG_NAME_OPEN; 282 hs->pos = pos + 1; 283 hs->state = h5_state_before_attribute_name; 284 return 1; 285 } else if (ch == CHAR_SLASH) { 286 hs->token_start = hs->s + hs->pos; 287 hs->token_len = pos - hs->pos; 288 hs->token_type = TAG_NAME_OPEN; 289 hs->pos = pos + 1; 290 hs->state = h5_state_self_closing_start_tag; 291 return 1; 292 } else if (ch == CHAR_GT) { 293 hs->token_start = hs->s + hs->pos; 294 hs->token_len = pos - hs->pos; 295 if (hs->is_close) { 296 hs->pos = pos + 1; 297 hs->is_close = 0; 298 hs->token_type = TAG_CLOSE; 299 hs->state = h5_state_data; 300 } else { 301 hs->pos = pos; 302 hs->token_type = TAG_NAME_OPEN; 303 hs->state = h5_state_tag_name_close; 304 } 305 return 1; 306 } else { 307 pos += 1; 308 } 309 } 310 311 hs->token_start = hs->s + hs->pos; 312 hs->token_len = hs->len - hs->pos; 313 hs->token_type = TAG_NAME_OPEN; 314 hs->state = h5_state_eof; 315 return 1; 316 } 317 318 /** 319 * 12.2.4.34 320 */ 321 static int h5_state_before_attribute_name(h5_state_t* hs) 322 { 323 int ch; 324 325 TRACE(); 326 327 /* for manual tail call optimization, see comment below */ 328 tail_call:; 329 330 ch = h5_skip_white(hs); 331 switch (ch) { 332 case CHAR_EOF: { 333 return 0; 334 } 335 case CHAR_SLASH: { 336 hs->pos += 1; 337 /* Logically, We want to call h5_state_self_closing_start_tag(hs) here. 338 339 As this function may call us back and the compiler 340 might not implement automatic tail call optimization, 341 this might result in a deep recursion. 342 343 We detect this case here and start over with the current state. 344 */ 345 346 if (hs->pos < hs->len && hs->s[hs->pos] != CHAR_GT) { 347 goto tail_call; 348 } 349 return h5_state_self_closing_start_tag(hs); 350 } 351 case CHAR_GT: { 352 hs->state = h5_state_data; 353 hs->token_start = hs->s + hs->pos; 354 hs->token_len = 1; 355 hs->token_type = TAG_NAME_CLOSE; 356 hs->pos += 1; 357 return 1; 358 } 359 default: { 360 return h5_state_attribute_name(hs); 361 } 362 } 363 } 364 365 static int h5_state_attribute_name(h5_state_t* hs) 366 { 367 char ch; 368 size_t pos; 369 370 TRACE(); 371 pos = hs->pos + 1; 372 while (pos < hs->len) { 373 ch = hs->s[pos]; 374 if (h5_is_white(ch)) { 375 hs->token_start = hs->s + hs->pos; 376 hs->token_len = pos - hs->pos; 377 hs->token_type = ATTR_NAME; 378 hs->state = h5_state_after_attribute_name; 379 hs->pos = pos + 1; 380 return 1; 381 } else if (ch == CHAR_SLASH) { 382 hs->token_start = hs->s + hs->pos; 383 hs->token_len = pos - hs->pos; 384 hs->token_type = ATTR_NAME; 385 hs->state = h5_state_self_closing_start_tag; 386 hs->pos = pos + 1; 387 return 1; 388 } else if (ch == CHAR_EQUALS) { 389 hs->token_start = hs->s + hs->pos; 390 hs->token_len = pos - hs->pos; 391 hs->token_type = ATTR_NAME; 392 hs->state = h5_state_before_attribute_value; 393 hs->pos = pos + 1; 394 return 1; 395 } else if (ch == CHAR_GT) { 396 hs->token_start = hs->s + hs->pos; 397 hs->token_len = pos - hs->pos; 398 hs->token_type = ATTR_NAME; 399 hs->state = h5_state_tag_name_close; 400 hs->pos = pos; 401 return 1; 402 } else { 403 pos += 1; 404 } 405 } 406 /* EOF */ 407 hs->token_start = hs->s + hs->pos; 408 hs->token_len = hs->len - hs->pos; 409 hs->token_type = ATTR_NAME; 410 hs->state = h5_state_eof; 411 hs->pos = hs->len; 412 return 1; 413 } 414 415 /** 416 * 12.2.4.36 417 */ 418 static int h5_state_after_attribute_name(h5_state_t* hs) 419 { 420 int c; 421 422 TRACE(); 423 c = h5_skip_white(hs); 424 switch (c) { 425 case CHAR_EOF: { 426 return 0; 427 } 428 case CHAR_SLASH: { 429 hs->pos += 1; 430 return h5_state_self_closing_start_tag(hs); 431 } 432 case CHAR_EQUALS: { 433 hs->pos += 1; 434 return h5_state_before_attribute_value(hs); 435 } 436 case CHAR_GT: { 437 return h5_state_tag_name_close(hs); 438 } 439 default: { 440 return h5_state_attribute_name(hs); 441 } 442 } 443 } 444 445 /** 446 * 12.2.4.37 447 */ 448 static int h5_state_before_attribute_value(h5_state_t* hs) 449 { 450 int c; 451 TRACE(); 452 453 c = h5_skip_white(hs); 454 455 if (c == CHAR_EOF) { 456 hs->state = h5_state_eof; 457 return 0; 458 } 459 460 if (c == CHAR_DOUBLE) { 461 return h5_state_attribute_value_double_quote(hs); 462 } else if (c == CHAR_SINGLE) { 463 return h5_state_attribute_value_single_quote(hs); 464 } else if (c == CHAR_TICK) { 465 /* NON STANDARD IE */ 466 return h5_state_attribute_value_back_quote(hs); 467 } else { 468 return h5_state_attribute_value_no_quote(hs); 469 } 470 } 471 472 473 static int h5_state_attribute_value_quote(h5_state_t* hs, char qchar) 474 { 475 const char* idx; 476 477 TRACE(); 478 479 /* skip initial quote in normal case. 480 * don't do this "if (pos == 0)" since it means we have started 481 * in a non-data state. given an input of '><foo 482 * we want to make 0-length attribute name 483 */ 484 if (hs->pos > 0) { 485 hs->pos += 1; 486 } 487 488 489 idx = (const char*) memchr(hs->s + hs->pos, qchar, hs->len - hs->pos); 490 if (idx == NULL) { 491 hs->token_start = hs->s + hs->pos; 492 hs->token_len = hs->len - hs->pos; 493 hs->token_type = ATTR_VALUE; 494 hs->state = h5_state_eof; 495 } else { 496 hs->token_start = hs->s + hs->pos; 497 hs->token_len = (size_t)(idx - hs->s) - hs->pos; 498 hs->token_type = ATTR_VALUE; 499 hs->state = h5_state_after_attribute_value_quoted_state; 500 hs->pos += hs->token_len + 1; 501 } 502 return 1; 503 } 504 505 static 506 int h5_state_attribute_value_double_quote(h5_state_t* hs) 507 { 508 TRACE(); 509 return h5_state_attribute_value_quote(hs, CHAR_DOUBLE); 510 } 511 512 static 513 int h5_state_attribute_value_single_quote(h5_state_t* hs) 514 { 515 TRACE(); 516 return h5_state_attribute_value_quote(hs, CHAR_SINGLE); 517 } 518 519 static 520 int h5_state_attribute_value_back_quote(h5_state_t* hs) 521 { 522 TRACE(); 523 return h5_state_attribute_value_quote(hs, CHAR_TICK); 524 } 525 526 static int h5_state_attribute_value_no_quote(h5_state_t* hs) 527 { 528 char ch; 529 size_t pos; 530 531 TRACE(); 532 pos = hs->pos; 533 while (pos < hs->len) { 534 ch = hs->s[pos]; 535 if (h5_is_white(ch)) { 536 hs->token_type = ATTR_VALUE; 537 hs->token_start = hs->s + hs->pos; 538 hs->token_len = pos - hs->pos; 539 hs->pos = pos + 1; 540 hs->state = h5_state_before_attribute_name; 541 return 1; 542 } else if (ch == CHAR_GT) { 543 hs->token_type = ATTR_VALUE; 544 hs->token_start = hs->s + hs->pos; 545 hs->token_len = pos - hs->pos; 546 hs->pos = pos; 547 hs->state = h5_state_tag_name_close; 548 return 1; 549 } 550 pos += 1; 551 } 552 TRACE(); 553 /* EOF */ 554 hs->state = h5_state_eof; 555 hs->token_start = hs->s + hs->pos; 556 hs->token_len = hs->len - hs->pos; 557 hs->token_type = ATTR_VALUE; 558 return 1; 559 } 560 561 /** 562 * 12.2.4.41 563 */ 564 static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs) 565 { 566 char ch; 567 568 TRACE(); 569 if (hs->pos >= hs->len) { 570 return 0; 571 } 572 ch = hs->s[hs->pos]; 573 if (h5_is_white(ch)) { 574 hs->pos += 1; 575 return h5_state_before_attribute_name(hs); 576 } else if (ch == CHAR_SLASH) { 577 hs->pos += 1; 578 return h5_state_self_closing_start_tag(hs); 579 } else if (ch == CHAR_GT) { 580 hs->token_start = hs->s + hs->pos; 581 hs->token_len = 1; 582 hs->token_type = TAG_NAME_CLOSE; 583 hs->pos += 1; 584 hs->state = h5_state_data; 585 return 1; 586 } else { 587 return h5_state_before_attribute_name(hs); 588 } 589 } 590 591 /** 592 * 12.2.4.43 593 * 594 * WARNING: This function is partially inlined into h5_state_before_attribute_name() 595 */ 596 static int h5_state_self_closing_start_tag(h5_state_t* hs) 597 { 598 char ch; 599 600 TRACE(); 601 if (hs->pos >= hs->len) { 602 return 0; 603 } 604 ch = hs->s[hs->pos]; 605 if (ch == CHAR_GT) { 606 assert(hs->pos > 0); 607 hs->token_start = hs->s + hs->pos -1; 608 hs->token_len = 2; 609 hs->token_type = TAG_NAME_SELFCLOSE; 610 hs->state = h5_state_data; 611 hs->pos += 1; 612 return 1; 613 } else { 614 return h5_state_before_attribute_name(hs); 615 } 616 } 617 618 /** 619 * 12.2.4.44 620 */ 621 static int h5_state_bogus_comment(h5_state_t* hs) 622 { 623 const char* idx; 624 625 TRACE(); 626 idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos); 627 if (idx == NULL) { 628 hs->token_start = hs->s + hs->pos; 629 hs->token_len = hs->len - hs->pos; 630 hs->pos = hs->len; 631 hs->state = h5_state_eof; 632 } else { 633 hs->token_start = hs->s + hs->pos; 634 hs->token_len = (size_t)(idx - hs->s) - hs->pos; 635 hs->pos = (size_t)(idx - hs->s) + 1; 636 hs->state = h5_state_data; 637 } 638 639 hs->token_type = TAG_COMMENT; 640 return 1; 641 } 642 643 /** 644 * 12.2.4.44 ALT 645 */ 646 static int h5_state_bogus_comment2(h5_state_t* hs) 647 { 648 const char* idx; 649 size_t pos; 650 651 TRACE(); 652 pos = hs->pos; 653 while (1) { 654 idx = (const char*) memchr(hs->s + pos, CHAR_PERCENT, hs->len - pos); 655 if (idx == NULL || (idx + 1 >= hs->s + hs->len)) { 656 hs->token_start = hs->s + hs->pos; 657 hs->token_len = hs->len - hs->pos; 658 hs->pos = hs->len; 659 hs->token_type = TAG_COMMENT; 660 hs->state = h5_state_eof; 661 return 1; 662 } 663 664 if (*(idx +1) != CHAR_GT) { 665 pos = (size_t)(idx - hs->s) + 1; 666 continue; 667 } 668 669 /* ends in %> */ 670 hs->token_start = hs->s + hs->pos; 671 hs->token_len = (size_t)(idx - hs->s) - hs->pos; 672 hs->pos = (size_t)(idx - hs->s) + 2; 673 hs->state = h5_state_data; 674 hs->token_type = TAG_COMMENT; 675 return 1; 676 } 677 } 678 679 /** 680 * 8.2.4.45 681 */ 682 static int h5_state_markup_declaration_open(h5_state_t* hs) 683 { 684 size_t remaining; 685 686 TRACE(); 687 remaining = hs->len - hs->pos; 688 if (remaining >= 7 && 689 /* case insensitive */ 690 (hs->s[hs->pos + 0] == 'D' || hs->s[hs->pos + 0] == 'd') && 691 (hs->s[hs->pos + 1] == 'O' || hs->s[hs->pos + 1] == 'o') && 692 (hs->s[hs->pos + 2] == 'C' || hs->s[hs->pos + 2] == 'c') && 693 (hs->s[hs->pos + 3] == 'T' || hs->s[hs->pos + 3] == 't') && 694 (hs->s[hs->pos + 4] == 'Y' || hs->s[hs->pos + 4] == 'y') && 695 (hs->s[hs->pos + 5] == 'P' || hs->s[hs->pos + 5] == 'p') && 696 (hs->s[hs->pos + 6] == 'E' || hs->s[hs->pos + 6] == 'e') 697 ) { 698 return h5_state_doctype(hs); 699 } else if (remaining >= 7 && 700 /* upper case required */ 701 hs->s[hs->pos + 0] == '[' && 702 hs->s[hs->pos + 1] == 'C' && 703 hs->s[hs->pos + 2] == 'D' && 704 hs->s[hs->pos + 3] == 'A' && 705 hs->s[hs->pos + 4] == 'T' && 706 hs->s[hs->pos + 5] == 'A' && 707 hs->s[hs->pos + 6] == '[' 708 ) { 709 hs->pos += 7; 710 return h5_state_cdata(hs); 711 } else if (remaining >= 2 && 712 hs->s[hs->pos + 0] == '-' && 713 hs->s[hs->pos + 1] == '-') { 714 hs->pos += 2; 715 return h5_state_comment(hs); 716 } 717 718 return h5_state_bogus_comment(hs); 719 } 720 721 /** 722 * 12.2.4.48 723 * 12.2.4.49 724 * 12.2.4.50 725 * 12.2.4.51 726 * state machine spec is confusing since it can only look 727 * at one character at a time but simply it's comments end by: 728 * 1) EOF 729 * 2) ending in --> 730 * 3) ending in -!> 731 */ 732 static int h5_state_comment(h5_state_t* hs) 733 { 734 char ch; 735 const char* idx; 736 size_t pos; 737 size_t offset; 738 const char* end = hs->s + hs->len; 739 740 TRACE(); 741 pos = hs->pos; 742 while (1) { 743 744 idx = (const char*) memchr(hs->s + pos, CHAR_DASH, hs->len - pos); 745 746 /* did not find anything or has less than 3 chars left */ 747 if (idx == NULL || idx > hs->s + hs->len - 3) { 748 hs->state = h5_state_eof; 749 hs->token_start = hs->s + hs->pos; 750 hs->token_len = hs->len - hs->pos; 751 hs->token_type = TAG_COMMENT; 752 return 1; 753 } 754 offset = 1; 755 756 /* skip all nulls */ 757 while (idx + offset < end && *(idx + offset) == 0) { 758 offset += 1; 759 } 760 if (idx + offset == end) { 761 hs->state = h5_state_eof; 762 hs->token_start = hs->s + hs->pos; 763 hs->token_len = hs->len - hs->pos; 764 hs->token_type = TAG_COMMENT; 765 return 1; 766 } 767 768 ch = *(idx + offset); 769 if (ch != CHAR_DASH && ch != CHAR_BANG) { 770 pos = (size_t)(idx - hs->s) + 1; 771 continue; 772 } 773 774 /* need to test */ 775 #if 0 776 /* skip all nulls */ 777 while (idx + offset < end && *(idx + offset) == 0) { 778 offset += 1; 779 } 780 if (idx + offset == end) { 781 hs->state = h5_state_eof; 782 hs->token_start = hs->s + hs->pos; 783 hs->token_len = hs->len - hs->pos; 784 hs->token_type = TAG_COMMENT; 785 return 1; 786 } 787 #endif 788 789 offset += 1; 790 if (idx + offset == end) { 791 hs->state = h5_state_eof; 792 hs->token_start = hs->s + hs->pos; 793 hs->token_len = hs->len - hs->pos; 794 hs->token_type = TAG_COMMENT; 795 return 1; 796 } 797 798 799 ch = *(idx + offset); 800 if (ch != CHAR_GT) { 801 pos = (size_t)(idx - hs->s) + 1; 802 continue; 803 } 804 offset += 1; 805 806 /* ends in --> or -!> */ 807 hs->token_start = hs->s + hs->pos; 808 hs->token_len = (size_t)(idx - hs->s) - hs->pos; 809 hs->pos = (size_t)(idx + offset - hs->s); 810 hs->state = h5_state_data; 811 hs->token_type = TAG_COMMENT; 812 return 1; 813 } 814 } 815 816 static int h5_state_cdata(h5_state_t* hs) 817 { 818 const char* idx; 819 size_t pos; 820 821 TRACE(); 822 pos = hs->pos; 823 while (1) { 824 idx = (const char*) memchr(hs->s + pos, CHAR_RIGHTB, hs->len - pos); 825 826 /* did not find anything or has less than 3 chars left */ 827 if (idx == NULL || idx > hs->s + hs->len - 3) { 828 hs->state = h5_state_eof; 829 hs->token_start = hs->s + hs->pos; 830 hs->token_len = hs->len - hs->pos; 831 hs->token_type = DATA_TEXT; 832 return 1; 833 } else if ( *(idx+1) == CHAR_RIGHTB && *(idx+2) == CHAR_GT) { 834 hs->state = h5_state_data; 835 hs->token_start = hs->s + hs->pos; 836 hs->token_len = (size_t)(idx - hs->s) - hs->pos; 837 hs->pos = (size_t)(idx - hs->s) + 3; 838 hs->token_type = DATA_TEXT; 839 return 1; 840 } else { 841 pos = (size_t)(idx - hs->s) + 1; 842 } 843 } 844 } 845 846 /** 847 * 8.2.4.52 848 * http://www.w3.org/html/wg/drafts/html/master/syntax.html#doctype-state 849 */ 850 static int h5_state_doctype(h5_state_t* hs) 851 { 852 const char* idx; 853 854 TRACE(); 855 hs->token_start = hs->s + hs->pos; 856 hs->token_type = DOCTYPE; 857 858 idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos); 859 if (idx == NULL) { 860 hs->state = h5_state_eof; 861 hs->token_len = hs->len - hs->pos; 862 } else { 863 hs->state = h5_state_data; 864 hs->token_len = (size_t)(idx - hs->s) - hs->pos; 865 hs->pos = (size_t)(idx - hs->s) + 1; 866 } 867 return 1; 868 }