github.com/TeaOSLab/EdgeNode@v1.3.8/internal/waf/injectionutils/libinjection/src/libinjection_html5.c (about)

     1  #include "libinjection_html5.h"
     2  
     3  #include <string.h>
     4  #include <assert.h>
     5  
     6  #ifdef DEBUG
     7  #include <stdio.h>
     8  #define TRACE() printf("%s:%d\n", __FUNCTION__, __LINE__)
     9  #else
    10  #define TRACE()
    11  #endif
    12  
    13  
    14  #define CHAR_EOF -1
    15  #define CHAR_NULL 0
    16  #define CHAR_BANG 33
    17  #define CHAR_DOUBLE 34
    18  #define CHAR_PERCENT 37
    19  #define CHAR_SINGLE 39
    20  #define CHAR_DASH 45
    21  #define CHAR_SLASH 47
    22  #define CHAR_LT 60
    23  #define CHAR_EQUALS 61
    24  #define CHAR_GT 62
    25  #define CHAR_QUESTION 63
    26  #define CHAR_RIGHTB 93
    27  #define CHAR_TICK 96
    28  
    29  /* prototypes */
    30  
    31  static int h5_skip_white(h5_state_t* hs);
    32  static int h5_is_white(char ch);
    33  static int h5_state_eof(h5_state_t* hs);
    34  static int h5_state_data(h5_state_t* hs);
    35  static int h5_state_tag_open(h5_state_t* hs);
    36  static int h5_state_tag_name(h5_state_t* hs);
    37  static int h5_state_tag_name_close(h5_state_t* hs);
    38  static int h5_state_end_tag_open(h5_state_t* hs);
    39  static int h5_state_self_closing_start_tag(h5_state_t* hs);
    40  static int h5_state_attribute_name(h5_state_t* hs);
    41  static int h5_state_after_attribute_name(h5_state_t* hs);
    42  static int h5_state_before_attribute_name(h5_state_t* hs);
    43  static int h5_state_before_attribute_value(h5_state_t* hs);
    44  static int h5_state_attribute_value_double_quote(h5_state_t* hs);
    45  static int h5_state_attribute_value_single_quote(h5_state_t* hs);
    46  static int h5_state_attribute_value_back_quote(h5_state_t* hs);
    47  static int h5_state_attribute_value_no_quote(h5_state_t* hs);
    48  static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs);
    49  static int h5_state_comment(h5_state_t* hs);
    50  static int h5_state_cdata(h5_state_t* hs);
    51  
    52  
    53  /* 12.2.4.44 */
    54  static int h5_state_bogus_comment(h5_state_t* hs);
    55  static int h5_state_bogus_comment2(h5_state_t* hs);
    56  
    57  /* 12.2.4.45 */
    58  static int h5_state_markup_declaration_open(h5_state_t* hs);
    59  
    60  /* 8.2.4.52 */
    61  static int h5_state_doctype(h5_state_t* hs);
    62  
    63  /**
    64   * public function
    65   */
    66  void libinjection_h5_init(h5_state_t* hs, const char* s, size_t len, enum html5_flags flags)
    67  {
    68      memset(hs, 0, sizeof(h5_state_t));
    69      hs->s = s;
    70      hs->len = len;
    71  
    72      switch (flags) {
    73      case DATA_STATE:
    74          hs->state = h5_state_data;
    75          break;
    76      case VALUE_NO_QUOTE:
    77          hs->state = h5_state_before_attribute_name;
    78          break;
    79      case VALUE_SINGLE_QUOTE:
    80          hs->state = h5_state_attribute_value_single_quote;
    81          break;
    82      case VALUE_DOUBLE_QUOTE:
    83          hs->state = h5_state_attribute_value_double_quote;
    84          break;
    85      case VALUE_BACK_QUOTE:
    86          hs->state = h5_state_attribute_value_back_quote;
    87          break;
    88      }
    89  }
    90  
    91  /**
    92   * public function
    93   */
    94  int libinjection_h5_next(h5_state_t* hs)
    95  {
    96      assert(hs->state != NULL);
    97      return (*hs->state)(hs);
    98  }
    99  
   100  /**
   101   * Everything below here is private
   102   *
   103   */
   104  
   105  
   106  static int h5_is_white(char ch)
   107  {
   108      /*
   109       * \t = horizontal tab = 0x09
   110       * \n = newline = 0x0A
   111       * \v = vertical tab = 0x0B
   112       * \f = form feed = 0x0C
   113       * \r = cr  = 0x0D
   114       */
   115      return strchr(" \t\n\v\f\r", ch) != NULL;
   116  }
   117  
   118  static int h5_skip_white(h5_state_t* hs)
   119  {
   120      char ch;
   121      while (hs->pos < hs->len) {
   122          ch = hs->s[hs->pos];
   123          switch (ch) {
   124          case 0x00: /* IE only */
   125          case 0x20:
   126          case 0x09:
   127          case 0x0A:
   128          case 0x0B: /* IE only */
   129          case 0x0C:
   130          case 0x0D: /* IE only */
   131              hs->pos += 1;
   132              break;
   133          default:
   134              return ch;
   135          }
   136      }
   137      return CHAR_EOF;
   138  }
   139  
   140  static int h5_state_eof(h5_state_t* hs)
   141  {
   142      /* eliminate unused function argument warning */
   143      (void)hs;
   144      return 0;
   145  }
   146  
   147  static int h5_state_data(h5_state_t* hs)
   148  {
   149      const char* idx;
   150  
   151      TRACE();
   152      assert(hs->len >= hs->pos);
   153      idx = (const char*) memchr(hs->s + hs->pos, CHAR_LT, hs->len - hs->pos);
   154      if (idx == NULL) {
   155          hs->token_start = hs->s + hs->pos;
   156          hs->token_len = hs->len - hs->pos;
   157          hs->token_type = DATA_TEXT;
   158          hs->state = h5_state_eof;
   159          if (hs->token_len == 0) {
   160              return 0;
   161          }
   162      } else {
   163          hs->token_start = hs->s + hs->pos;
   164          hs->token_type = DATA_TEXT;
   165          hs->token_len = (size_t)(idx - hs->s) - hs->pos;
   166          hs->pos = (size_t)(idx - hs->s) + 1;
   167          hs->state = h5_state_tag_open;
   168          if (hs->token_len == 0) {
   169              return h5_state_tag_open(hs);
   170          }
   171      }
   172      return 1;
   173  }
   174  
   175  /**
   176   * 12 2.4.8
   177   */
   178  static int h5_state_tag_open(h5_state_t* hs)
   179  {
   180      char ch;
   181  
   182      TRACE();
   183      if (hs->pos >= hs->len) {
   184          return 0;
   185      }
   186      ch = hs->s[hs->pos];
   187      if (ch == CHAR_BANG) {
   188          hs->pos += 1;
   189          return h5_state_markup_declaration_open(hs);
   190      } else if (ch == CHAR_SLASH) {
   191          hs->pos += 1;
   192          hs->is_close = 1;
   193          return h5_state_end_tag_open(hs);
   194      } else if (ch == CHAR_QUESTION) {
   195          hs->pos += 1;
   196          return h5_state_bogus_comment(hs);
   197      } else if (ch == CHAR_PERCENT) {
   198          /* this is not in spec.. alternative comment format used
   199             by IE <= 9 and Safari < 4.0.3 */
   200          hs->pos += 1;
   201          return h5_state_bogus_comment2(hs);
   202      } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
   203          return h5_state_tag_name(hs);
   204      } else if (ch == CHAR_NULL) {
   205          /* IE-ism  NULL characters are ignored */
   206          return h5_state_tag_name(hs);
   207      } else {
   208          /* user input mistake in configuring state */
   209          if (hs->pos == 0) {
   210              return h5_state_data(hs);
   211          }
   212          hs->token_start = hs->s + hs->pos - 1;
   213          hs->token_len = 1;
   214          hs->token_type = DATA_TEXT;
   215          hs->state = h5_state_data;
   216          return 1;
   217      }
   218  }
   219  /**
   220   * 12.2.4.9
   221   */
   222  static int h5_state_end_tag_open(h5_state_t* hs)
   223  {
   224      char ch;
   225  
   226      TRACE();
   227  
   228      if (hs->pos >= hs->len) {
   229          return 0;
   230      }
   231      ch = hs->s[hs->pos];
   232      if (ch == CHAR_GT) {
   233          return h5_state_data(hs);
   234      } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
   235          return h5_state_tag_name(hs);
   236      }
   237  
   238      hs->is_close = 0;
   239      return h5_state_bogus_comment(hs);
   240  }
   241  /*
   242   *
   243   */
   244  static int h5_state_tag_name_close(h5_state_t* hs)
   245  {
   246      TRACE();
   247      hs->is_close = 0;
   248      hs->token_start = hs->s + hs->pos;
   249      hs->token_len = 1;
   250      hs->token_type = TAG_NAME_CLOSE;
   251      hs->pos += 1;
   252      if (hs->pos < hs->len) {
   253          hs->state = h5_state_data;
   254      } else {
   255          hs->state = h5_state_eof;
   256      }
   257  
   258      return 1;
   259  }
   260  
   261  /**
   262   * 12.2.4.10
   263   */
   264  static int h5_state_tag_name(h5_state_t* hs)
   265  {
   266      char ch;
   267      size_t pos;
   268  
   269      TRACE();
   270      pos = hs->pos;
   271      while (pos < hs->len) {
   272          ch = hs->s[pos];
   273          if (ch == 0) {
   274              /* special non-standard case */
   275              /* allow nulls in tag name   */
   276              /* some old browsers apparently allow and ignore them */
   277              pos += 1;
   278          } else if (h5_is_white(ch)) {
   279              hs->token_start = hs->s + hs->pos;
   280              hs->token_len = pos - hs->pos;
   281              hs->token_type = TAG_NAME_OPEN;
   282              hs->pos = pos + 1;
   283              hs->state = h5_state_before_attribute_name;
   284              return 1;
   285          } else if (ch == CHAR_SLASH) {
   286              hs->token_start = hs->s + hs->pos;
   287              hs->token_len = pos - hs->pos;
   288              hs->token_type = TAG_NAME_OPEN;
   289              hs->pos = pos + 1;
   290              hs->state = h5_state_self_closing_start_tag;
   291              return 1;
   292          } else if (ch == CHAR_GT) {
   293              hs->token_start = hs->s + hs->pos;
   294              hs->token_len = pos - hs->pos;
   295              if (hs->is_close) {
   296                  hs->pos = pos + 1;
   297                  hs->is_close = 0;
   298                  hs->token_type = TAG_CLOSE;
   299                  hs->state = h5_state_data;
   300              } else {
   301                  hs->pos = pos;
   302                  hs->token_type = TAG_NAME_OPEN;
   303                  hs->state = h5_state_tag_name_close;
   304              }
   305              return 1;
   306          } else {
   307              pos += 1;
   308          }
   309      }
   310  
   311      hs->token_start = hs->s + hs->pos;
   312      hs->token_len = hs->len - hs->pos;
   313      hs->token_type = TAG_NAME_OPEN;
   314      hs->state = h5_state_eof;
   315      return 1;
   316  }
   317  
   318  /**
   319   * 12.2.4.34
   320   */
   321  static int h5_state_before_attribute_name(h5_state_t* hs)
   322  {
   323      int ch;
   324  
   325      TRACE();
   326  
   327      /* for manual tail call optimization, see comment below */
   328      tail_call:;
   329  
   330      ch = h5_skip_white(hs);
   331      switch (ch) {
   332      case CHAR_EOF: {
   333          return 0;
   334      }
   335      case CHAR_SLASH: {
   336          hs->pos += 1;
   337          /* Logically, We want to call h5_state_self_closing_start_tag(hs) here.
   338  
   339             As this function may call us back and the compiler
   340             might not implement automatic tail call optimization,
   341             this might result in a deep recursion.
   342  
   343             We detect this case here and start over with the current state.
   344          */
   345  
   346          if (hs->pos < hs->len && hs->s[hs->pos] != CHAR_GT) {
   347              goto tail_call;
   348          }
   349          return h5_state_self_closing_start_tag(hs);
   350      }
   351      case CHAR_GT: {
   352          hs->state = h5_state_data;
   353          hs->token_start = hs->s + hs->pos;
   354          hs->token_len = 1;
   355          hs->token_type = TAG_NAME_CLOSE;
   356          hs->pos += 1;
   357          return 1;
   358      }
   359      default: {
   360          return h5_state_attribute_name(hs);
   361      }
   362      }
   363  }
   364  
   365  static int h5_state_attribute_name(h5_state_t* hs)
   366  {
   367      char ch;
   368      size_t pos;
   369  
   370      TRACE();
   371      pos = hs->pos + 1;
   372      while (pos < hs->len) {
   373          ch = hs->s[pos];
   374          if (h5_is_white(ch)) {
   375              hs->token_start = hs->s + hs->pos;
   376              hs->token_len   = pos - hs->pos;
   377              hs->token_type  = ATTR_NAME;
   378              hs->state = h5_state_after_attribute_name;
   379              hs->pos = pos + 1;
   380              return 1;
   381          } else if (ch == CHAR_SLASH) {
   382              hs->token_start = hs->s + hs->pos;
   383              hs->token_len   = pos - hs->pos;
   384              hs->token_type  = ATTR_NAME;
   385              hs->state = h5_state_self_closing_start_tag;
   386              hs->pos = pos + 1;
   387              return 1;
   388          } else if (ch == CHAR_EQUALS) {
   389              hs->token_start = hs->s + hs->pos;
   390              hs->token_len   = pos - hs->pos;
   391              hs->token_type  = ATTR_NAME;
   392              hs->state = h5_state_before_attribute_value;
   393              hs->pos = pos + 1;
   394              return 1;
   395          } else if (ch == CHAR_GT) {
   396              hs->token_start = hs->s + hs->pos;
   397              hs->token_len   = pos - hs->pos;
   398              hs->token_type  = ATTR_NAME;
   399              hs->state = h5_state_tag_name_close;
   400              hs->pos = pos;
   401              return 1;
   402          } else {
   403              pos += 1;
   404          }
   405      }
   406      /* EOF */
   407      hs->token_start = hs->s + hs->pos;
   408      hs->token_len   = hs->len - hs->pos;
   409      hs->token_type  = ATTR_NAME;
   410      hs->state = h5_state_eof;
   411      hs->pos = hs->len;
   412      return 1;
   413  }
   414  
   415  /**
   416   * 12.2.4.36
   417   */
   418  static int h5_state_after_attribute_name(h5_state_t* hs)
   419  {
   420      int c;
   421  
   422      TRACE();
   423      c = h5_skip_white(hs);
   424      switch (c) {
   425      case CHAR_EOF: {
   426          return 0;
   427      }
   428      case CHAR_SLASH: {
   429          hs->pos += 1;
   430          return h5_state_self_closing_start_tag(hs);
   431      }
   432      case CHAR_EQUALS: {
   433          hs->pos += 1;
   434          return h5_state_before_attribute_value(hs);
   435      }
   436      case CHAR_GT: {
   437          return h5_state_tag_name_close(hs);
   438      }
   439      default: {
   440          return h5_state_attribute_name(hs);
   441      }
   442      }
   443  }
   444  
   445  /**
   446   * 12.2.4.37
   447   */
   448  static int h5_state_before_attribute_value(h5_state_t* hs)
   449  {
   450      int c;
   451      TRACE();
   452  
   453      c = h5_skip_white(hs);
   454  
   455      if (c == CHAR_EOF) {
   456          hs->state = h5_state_eof;
   457          return 0;
   458      }
   459  
   460      if (c == CHAR_DOUBLE) {
   461          return h5_state_attribute_value_double_quote(hs);
   462      } else if (c == CHAR_SINGLE) {
   463          return h5_state_attribute_value_single_quote(hs);
   464      } else if (c == CHAR_TICK) {
   465          /* NON STANDARD IE */
   466          return h5_state_attribute_value_back_quote(hs);
   467      } else {
   468          return h5_state_attribute_value_no_quote(hs);
   469      }
   470  }
   471  
   472  
   473  static int h5_state_attribute_value_quote(h5_state_t* hs, char qchar)
   474  {
   475      const char* idx;
   476  
   477      TRACE();
   478  
   479      /* skip initial quote in normal case.
   480       * don't do this "if (pos == 0)" since it means we have started
   481       * in a non-data state.  given an input of '><foo
   482       * we want to make 0-length attribute name
   483       */
   484      if (hs->pos > 0) {
   485          hs->pos += 1;
   486      }
   487  
   488  
   489      idx = (const char*) memchr(hs->s + hs->pos, qchar, hs->len - hs->pos);
   490      if (idx == NULL) {
   491          hs->token_start = hs->s + hs->pos;
   492          hs->token_len = hs->len - hs->pos;
   493          hs->token_type = ATTR_VALUE;
   494          hs->state = h5_state_eof;
   495      } else {
   496          hs->token_start = hs->s + hs->pos;
   497          hs->token_len = (size_t)(idx - hs->s) - hs->pos;
   498          hs->token_type = ATTR_VALUE;
   499          hs->state = h5_state_after_attribute_value_quoted_state;
   500          hs->pos += hs->token_len + 1;
   501      }
   502      return 1;
   503  }
   504  
   505  static
   506  int h5_state_attribute_value_double_quote(h5_state_t* hs)
   507  {
   508      TRACE();
   509      return h5_state_attribute_value_quote(hs, CHAR_DOUBLE);
   510  }
   511  
   512  static
   513  int h5_state_attribute_value_single_quote(h5_state_t* hs)
   514  {
   515      TRACE();
   516      return h5_state_attribute_value_quote(hs, CHAR_SINGLE);
   517  }
   518  
   519  static
   520  int h5_state_attribute_value_back_quote(h5_state_t* hs)
   521  {
   522      TRACE();
   523      return h5_state_attribute_value_quote(hs, CHAR_TICK);
   524  }
   525  
   526  static int h5_state_attribute_value_no_quote(h5_state_t* hs)
   527  {
   528      char ch;
   529      size_t pos;
   530  
   531      TRACE();
   532      pos = hs->pos;
   533      while (pos < hs->len) {
   534          ch = hs->s[pos];
   535          if (h5_is_white(ch)) {
   536              hs->token_type = ATTR_VALUE;
   537              hs->token_start = hs->s + hs->pos;
   538              hs->token_len = pos - hs->pos;
   539              hs->pos = pos + 1;
   540              hs->state = h5_state_before_attribute_name;
   541              return 1;
   542          } else if (ch == CHAR_GT) {
   543              hs->token_type = ATTR_VALUE;
   544              hs->token_start = hs->s + hs->pos;
   545              hs->token_len = pos - hs->pos;
   546              hs->pos = pos;
   547              hs->state = h5_state_tag_name_close;
   548              return 1;
   549          }
   550          pos += 1;
   551      }
   552      TRACE();
   553      /* EOF */
   554      hs->state = h5_state_eof;
   555      hs->token_start = hs->s + hs->pos;
   556      hs->token_len = hs->len - hs->pos;
   557      hs->token_type = ATTR_VALUE;
   558      return 1;
   559  }
   560  
   561  /**
   562   * 12.2.4.41
   563   */
   564  static int h5_state_after_attribute_value_quoted_state(h5_state_t* hs)
   565  {
   566      char ch;
   567  
   568      TRACE();
   569      if (hs->pos >= hs->len) {
   570          return 0;
   571      }
   572      ch = hs->s[hs->pos];
   573      if (h5_is_white(ch)) {
   574          hs->pos += 1;
   575          return h5_state_before_attribute_name(hs);
   576      } else if (ch == CHAR_SLASH) {
   577          hs->pos += 1;
   578          return h5_state_self_closing_start_tag(hs);
   579      } else if (ch == CHAR_GT) {
   580          hs->token_start = hs->s + hs->pos;
   581          hs->token_len = 1;
   582          hs->token_type = TAG_NAME_CLOSE;
   583          hs->pos += 1;
   584          hs->state = h5_state_data;
   585          return 1;
   586      } else {
   587          return h5_state_before_attribute_name(hs);
   588      }
   589  }
   590  
   591  /**
   592   * 12.2.4.43
   593   *
   594   *  WARNING: This function is partially inlined into h5_state_before_attribute_name()
   595   */
   596  static int h5_state_self_closing_start_tag(h5_state_t* hs)
   597  {
   598      char ch;
   599  
   600      TRACE();
   601      if (hs->pos >= hs->len) {
   602          return 0;
   603      }
   604      ch = hs->s[hs->pos];
   605      if (ch == CHAR_GT) {
   606          assert(hs->pos > 0);
   607          hs->token_start = hs->s + hs->pos -1;
   608          hs->token_len = 2;
   609          hs->token_type = TAG_NAME_SELFCLOSE;
   610          hs->state = h5_state_data;
   611          hs->pos += 1;
   612          return 1;
   613      } else {
   614          return h5_state_before_attribute_name(hs);
   615      }
   616  }
   617  
   618  /**
   619   * 12.2.4.44
   620   */
   621  static int h5_state_bogus_comment(h5_state_t* hs)
   622  {
   623      const char* idx;
   624  
   625      TRACE();
   626      idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
   627      if (idx == NULL) {
   628          hs->token_start = hs->s + hs->pos;
   629          hs->token_len = hs->len - hs->pos;
   630          hs->pos = hs->len;
   631          hs->state = h5_state_eof;
   632      } else {
   633          hs->token_start = hs->s + hs->pos;
   634          hs->token_len = (size_t)(idx - hs->s) - hs->pos;
   635          hs->pos =  (size_t)(idx - hs->s) + 1;
   636          hs->state = h5_state_data;
   637      }
   638  
   639      hs->token_type = TAG_COMMENT;
   640      return 1;
   641  }
   642  
   643  /**
   644   * 12.2.4.44 ALT
   645   */
   646  static int h5_state_bogus_comment2(h5_state_t* hs)
   647  {
   648      const char* idx;
   649      size_t pos;
   650  
   651      TRACE();
   652      pos = hs->pos;
   653      while (1) {
   654          idx = (const char*) memchr(hs->s + pos, CHAR_PERCENT, hs->len - pos);
   655          if (idx == NULL || (idx + 1 >= hs->s + hs->len)) {
   656              hs->token_start = hs->s + hs->pos;
   657              hs->token_len = hs->len - hs->pos;
   658              hs->pos = hs->len;
   659              hs->token_type = TAG_COMMENT;
   660              hs->state = h5_state_eof;
   661              return 1;
   662          }
   663  
   664          if (*(idx +1) != CHAR_GT) {
   665              pos = (size_t)(idx - hs->s) + 1;
   666              continue;
   667          }
   668  
   669          /* ends in %> */
   670          hs->token_start = hs->s + hs->pos;
   671          hs->token_len = (size_t)(idx - hs->s) - hs->pos;
   672          hs->pos = (size_t)(idx - hs->s) + 2;
   673          hs->state = h5_state_data;
   674          hs->token_type = TAG_COMMENT;
   675          return 1;
   676      }
   677  }
   678  
   679  /**
   680   * 8.2.4.45
   681   */
   682  static int h5_state_markup_declaration_open(h5_state_t* hs)
   683  {
   684      size_t remaining;
   685  
   686      TRACE();
   687      remaining = hs->len - hs->pos;
   688      if (remaining >= 7 &&
   689          /* case insensitive */
   690          (hs->s[hs->pos + 0] == 'D' || hs->s[hs->pos + 0] == 'd') &&
   691          (hs->s[hs->pos + 1] == 'O' || hs->s[hs->pos + 1] == 'o') &&
   692          (hs->s[hs->pos + 2] == 'C' || hs->s[hs->pos + 2] == 'c') &&
   693          (hs->s[hs->pos + 3] == 'T' || hs->s[hs->pos + 3] == 't') &&
   694          (hs->s[hs->pos + 4] == 'Y' || hs->s[hs->pos + 4] == 'y') &&
   695          (hs->s[hs->pos + 5] == 'P' || hs->s[hs->pos + 5] == 'p') &&
   696          (hs->s[hs->pos + 6] == 'E' || hs->s[hs->pos + 6] == 'e')
   697          ) {
   698          return h5_state_doctype(hs);
   699      } else if (remaining >= 7 &&
   700                 /* upper case required */
   701                 hs->s[hs->pos + 0] == '[' &&
   702                 hs->s[hs->pos + 1] == 'C' &&
   703                 hs->s[hs->pos + 2] == 'D' &&
   704                 hs->s[hs->pos + 3] == 'A' &&
   705                 hs->s[hs->pos + 4] == 'T' &&
   706                 hs->s[hs->pos + 5] == 'A' &&
   707                 hs->s[hs->pos + 6] == '['
   708          ) {
   709          hs->pos += 7;
   710          return h5_state_cdata(hs);
   711      } else if (remaining >= 2 &&
   712                 hs->s[hs->pos + 0] == '-' &&
   713                 hs->s[hs->pos + 1] == '-') {
   714          hs->pos += 2;
   715          return h5_state_comment(hs);
   716      }
   717  
   718      return h5_state_bogus_comment(hs);
   719  }
   720  
   721  /**
   722   * 12.2.4.48
   723   * 12.2.4.49
   724   * 12.2.4.50
   725   * 12.2.4.51
   726   *   state machine spec is confusing since it can only look
   727   *   at one character at a time but simply it's comments end by:
   728   *   1) EOF
   729   *   2) ending in -->
   730   *   3) ending in -!>
   731   */
   732  static int h5_state_comment(h5_state_t* hs)
   733  {
   734      char ch;
   735      const char* idx;
   736      size_t pos;
   737      size_t offset;
   738      const char* end = hs->s + hs->len;
   739  
   740      TRACE();
   741      pos = hs->pos;
   742      while (1) {
   743  
   744          idx = (const char*) memchr(hs->s + pos, CHAR_DASH, hs->len - pos);
   745  
   746          /* did not find anything or has less than 3 chars left */
   747          if (idx == NULL || idx > hs->s + hs->len - 3) {
   748              hs->state = h5_state_eof;
   749              hs->token_start = hs->s + hs->pos;
   750              hs->token_len = hs->len - hs->pos;
   751              hs->token_type = TAG_COMMENT;
   752              return 1;
   753          }
   754          offset = 1;
   755  
   756          /* skip all nulls */
   757          while (idx + offset < end && *(idx + offset) == 0) {
   758              offset += 1;
   759          }
   760          if (idx + offset == end) {
   761              hs->state = h5_state_eof;
   762              hs->token_start = hs->s + hs->pos;
   763              hs->token_len = hs->len - hs->pos;
   764              hs->token_type = TAG_COMMENT;
   765              return 1;
   766          }
   767  
   768          ch = *(idx + offset);
   769          if (ch != CHAR_DASH && ch != CHAR_BANG) {
   770              pos = (size_t)(idx - hs->s) + 1;
   771              continue;
   772          }
   773  
   774          /* need to test */
   775  #if 0
   776          /* skip all nulls */
   777          while (idx + offset < end && *(idx + offset) == 0) {
   778              offset += 1;
   779          }
   780          if (idx + offset == end) {
   781              hs->state = h5_state_eof;
   782              hs->token_start = hs->s + hs->pos;
   783              hs->token_len = hs->len - hs->pos;
   784              hs->token_type = TAG_COMMENT;
   785              return 1;
   786          }
   787  #endif
   788  
   789          offset += 1;
   790          if (idx + offset == end) {
   791              hs->state = h5_state_eof;
   792              hs->token_start = hs->s + hs->pos;
   793              hs->token_len = hs->len - hs->pos;
   794              hs->token_type = TAG_COMMENT;
   795              return 1;
   796          }
   797  
   798  
   799          ch = *(idx + offset);
   800          if (ch != CHAR_GT) {
   801              pos = (size_t)(idx - hs->s) + 1;
   802              continue;
   803          }
   804          offset += 1;
   805  
   806          /* ends in --> or -!> */
   807          hs->token_start = hs->s + hs->pos;
   808          hs->token_len = (size_t)(idx - hs->s) - hs->pos;
   809          hs->pos = (size_t)(idx + offset - hs->s);
   810          hs->state = h5_state_data;
   811          hs->token_type = TAG_COMMENT;
   812          return 1;
   813      }
   814  }
   815  
   816  static int h5_state_cdata(h5_state_t* hs)
   817  {
   818      const char* idx;
   819      size_t pos;
   820  
   821      TRACE();
   822      pos = hs->pos;
   823      while (1) {
   824          idx = (const char*) memchr(hs->s + pos, CHAR_RIGHTB, hs->len - pos);
   825  
   826          /* did not find anything or has less than 3 chars left */
   827          if (idx == NULL || idx > hs->s + hs->len - 3) {
   828              hs->state = h5_state_eof;
   829              hs->token_start = hs->s + hs->pos;
   830              hs->token_len = hs->len - hs->pos;
   831              hs->token_type = DATA_TEXT;
   832              return 1;
   833          } else if ( *(idx+1) == CHAR_RIGHTB && *(idx+2) == CHAR_GT) {
   834              hs->state = h5_state_data;
   835              hs->token_start = hs->s + hs->pos;
   836              hs->token_len = (size_t)(idx - hs->s) - hs->pos;
   837              hs->pos = (size_t)(idx - hs->s) + 3;
   838              hs->token_type = DATA_TEXT;
   839              return 1;
   840          } else {
   841              pos = (size_t)(idx - hs->s) + 1;
   842          }
   843      }
   844  }
   845  
   846  /**
   847   * 8.2.4.52
   848   * http://www.w3.org/html/wg/drafts/html/master/syntax.html#doctype-state
   849   */
   850  static int h5_state_doctype(h5_state_t* hs)
   851  {
   852      const char* idx;
   853  
   854      TRACE();
   855      hs->token_start = hs->s + hs->pos;
   856      hs->token_type = DOCTYPE;
   857  
   858      idx = (const char*) memchr(hs->s + hs->pos, CHAR_GT, hs->len - hs->pos);
   859      if (idx == NULL) {
   860          hs->state = h5_state_eof;
   861          hs->token_len = hs->len - hs->pos;
   862      } else {
   863          hs->state = h5_state_data;
   864          hs->token_len = (size_t)(idx - hs->s) - hs->pos;
   865          hs->pos = (size_t)(idx - hs->s) + 1;
   866      }
   867      return 1;
   868  }