github.com/TeaOSLab/EdgeNode@v1.3.8/internal/waf/injectionutils/libinjection/src/libinjection_sqli.c (about)

     1  /**
     2   * Copyright 2012,2016  Nick Galbreath
     3   * nickg@client9.com
     4   * BSD License -- see COPYING.txt for details
     5   *
     6   * https://libinjection.client9.com/
     7   *
     8   */
     9  
    10  #include <string.h>
    11  #include <stdlib.h>
    12  #include <stdio.h>
    13  #include <ctype.h>
    14  #include <assert.h>
    15  #include <stddef.h>
    16  
    17  #include "libinjection.h"
    18  #include "libinjection_sqli.h"
    19  #include "libinjection_sqli_data.h"
    20  
#ifdef __clang_analyzer__
// make clang analyzer happy by defining a dummy version
#define LIBINJECTION_VERSION "undefined"
#endif

/* Size in bytes of the 'val' buffer inside stoken_t (including the slot
 * used for the terminating NUL written by st_assign). */
#define LIBINJECTION_SQLI_TOKEN_SIZE  sizeof(((stoken_t*)(0))->val)
/* NOTE(review): presumably the maximum number of tokens kept in a
 * fingerprint -- confirm against the folding code. */
#define LIBINJECTION_SQLI_MAX_TOKENS  5

/* Boolean constants, defined only if the platform has not already done so. */
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif

/* Named character constants used as string delimiters / "no delimiter". */
#define CHAR_NULL    '\0'
#define CHAR_SINGLE  '\''
#define CHAR_DOUBLE  '"'
#define CHAR_TICK    '`'

/* faster than calling out to libc isdigit
 * (the unsigned subtraction makes values below '0' wrap to a large number,
 * so a single comparison covers both bounds) */
#define ISDIGIT(a) ((unsigned)((a) - '0') <= 9)

/* Debug trace hook: flip the "#if 0" to "#if 1" to print the line number and
 * the local variables more/pos/left at each FOLD_DEBUG site. Expands to
 * nothing by default. */
#if 0
#define FOLD_DEBUG printf("%d \t more=%d  pos=%d left=%d\n", __LINE__, more, (int)pos, (int)left);
#else
#define FOLD_DEBUG
#endif
    49  
    50  /*
    51   * not making public just yet
    52   */
    53  typedef enum {
    54      TYPE_NONE        = 0
    55      , TYPE_KEYWORD     = (int)'k'
    56      , TYPE_UNION       = (int)'U'
    57      , TYPE_GROUP       = (int)'B'
    58      , TYPE_EXPRESSION  = (int)'E'
    59      , TYPE_SQLTYPE     = (int)'t'
    60      , TYPE_FUNCTION    = (int)'f'
    61      , TYPE_BAREWORD    = (int)'n'
    62      , TYPE_NUMBER      = (int)'1'
    63      , TYPE_VARIABLE    = (int)'v'
    64      , TYPE_STRING      = (int)'s'
    65      , TYPE_OPERATOR    = (int)'o'
    66      , TYPE_LOGIC_OPERATOR = (int)'&'
    67      , TYPE_COMMENT     = (int)'c'
    68      , TYPE_COLLATE     = (int)'A'
    69      , TYPE_LEFTPARENS  = (int)'('
    70      , TYPE_RIGHTPARENS = (int)')'  /* not used? */
    71      , TYPE_LEFTBRACE   = (int)'{'
    72      , TYPE_RIGHTBRACE  = (int)'}'
    73      , TYPE_DOT         = (int)'.'
    74      , TYPE_COMMA       = (int)','
    75      , TYPE_COLON       = (int)':'
    76      , TYPE_SEMICOLON   = (int)';'
    77      , TYPE_TSQL        = (int)'T'  /* TSQL start */
    78      , TYPE_UNKNOWN     = (int)'?'
    79      , TYPE_EVIL        = (int)'X'  /* unparsable, abort  */
    80      , TYPE_FINGERPRINT = (int)'F'  /* not really a token */
    81      , TYPE_BACKSLASH   = (int)'\\'
    82  } sqli_token_types;
    83  
    84  /**
    85   * Initializes parsing state
    86   *
    87   */
    88  static char flag2delim(int flag)
    89  {
    90      if (flag & FLAG_QUOTE_SINGLE) {
    91          return CHAR_SINGLE;
    92      } else if (flag & FLAG_QUOTE_DOUBLE) {
    93          return CHAR_DOUBLE;
    94      } else {
    95          return CHAR_NULL;
    96      }
    97  }
    98  
    99  /* memchr2 finds a string of 2 characters inside another string
   100   * This a specialized version of "memmem" or "memchr".
   101   * 'memmem' doesn't exist on all platforms
   102   *
   103   * Porting notes: this is just a special version of
   104   *    astring.find("AB")
   105   *
   106   */
   107  static const char *
   108  memchr2(const char *haystack, size_t haystack_len, char c0, char c1)
   109  {
   110      const char *cur = haystack;
   111      const char *last = haystack + haystack_len - 1;
   112  
   113      if (haystack_len < 2) {
   114          return NULL;
   115      }
   116  
   117      while (cur < last) {
   118          /* safe since cur < len - 1 always */
   119          if (cur[0] == c0 && cur[1] == c1) {
   120              return cur;
   121          }
   122          cur += 1;
   123      }
   124  
   125      return NULL;
   126  }
   127  
   128  /**
   129   * memmem might not exist on some systems
   130   */
   131  static const char *
   132  my_memmem(const char* haystack, size_t hlen, const char* needle, size_t nlen)
   133  {
   134      const char* cur;
   135      const char* last;
   136      assert(haystack);
   137      assert(needle);
   138      assert(nlen > 1);
   139      last =  haystack + hlen - nlen;
   140      for (cur = haystack; cur <= last; ++cur) {
   141          if (cur[0] == needle[0] && memcmp(cur, needle, nlen) == 0) {
   142              return cur;
   143          }
   144      }
   145      return NULL;
   146  }
   147  
   148  /** Find largest string containing certain characters.
   149   *
   150   * C Standard library 'strspn' only works for 'c-strings' (null terminated)
   151   * This works on arbitrary length.
   152   *
   153   * Performance notes:
   154   *   not critical
   155   *
   156   * Porting notes:
   157   *   if accept is 'ABC', then this function would be similar to
   158   *   a_regexp.match(a_str, '[ABC]*'),
   159   */
   160  static size_t
   161  strlenspn(const char *s, size_t len, const char *accept)
   162  {
   163      size_t i;
   164      for (i = 0; i < len; ++i) {
   165          /* likely we can do better by inlining this function
   166           * but this works for now
   167           */
   168          if (strchr(accept, s[i]) == NULL) {
   169              return i;
   170          }
   171      }
   172      return len;
   173  }
   174  
   175  static size_t
   176  strlencspn(const char *s, size_t len, const char *accept)
   177  {
   178      size_t i;
   179      for (i = 0; i < len; ++i) {
   180          /* likely we can do better by inlining this function
   181           * but this works for now
   182           */
   183          if (strchr(accept, s[i]) != NULL) {
   184              return i;
   185          }
   186      }
   187      return len;
   188  }
   189  static int char_is_white(char ch) {
   190      /* ' '  space is 0x32
   191         '\t  0x09 \011 horizontal tab
   192         '\n' 0x0a \012 new line
   193         '\v' 0x0b \013 vertical tab
   194         '\f' 0x0c \014 new page
   195         '\r' 0x0d \015 carriage return
   196              0x00 \000 null (oracle)
   197              0xa0 \240 is Latin-1
   198      */
   199      return strchr(" \t\n\v\f\r\240\000", ch) != NULL;
   200  }
   201  
   202  /* DANGER DANGER
   203   * This is -very specialized function-
   204   *
   205   * this compares a ALL_UPPER CASE C STRING
   206   * with a *arbitrary memory* + length
   207   *
   208   * Sane people would just make a copy, up-case
   209   * and use a hash table.
   210   *
   211   * Required since libc version uses the current locale
   212   * and is much slower.
   213   */
   214  static int cstrcasecmp(const char *a, const char *b, size_t n)
   215  {
   216      char cb;
   217  
   218      for (; n > 0; a++, b++, n--) {
   219          cb = *b;
   220          if (cb >= 'a' && cb <= 'z') {
   221              cb -= 0x20;
   222          }
   223          if (*a != cb) {
   224              return *a - cb;
   225          } else if (*a == '\0') {
   226              return -1;
   227          }
   228      }
   229  
   230      return (*a == 0) ? 0 : 1;
   231  }
   232  
   233  /**
   234   * Case sensitive string compare.
   235   *  Here only to make code more readable
   236   */
   237  static int streq(const char *a, const char *b)
   238  {
   239      return strcmp(a, b) == 0;
   240  }
   241  
   242  /**
   243   *
   244   *
   245   *
   246   * Porting Notes:
   247   *  given a mapping/hash of string to char
   248   *  this is just
   249   *    typecode = mapping[key.upper()]
   250   */
   251  
   252  static char bsearch_keyword_type(const char *key, size_t len,
   253                                   const keyword_t * keywords, size_t numb)
   254  {
   255      size_t pos;
   256      size_t left = 0;
   257      size_t right = numb - 1;
   258  
   259      while (left < right) {
   260          pos = (left + right) >> 1;
   261  
   262          /* arg0 = upper case only, arg1 = mixed case */
   263          if (cstrcasecmp(keywords[pos].word, key, len) < 0) {
   264              left = pos + 1;
   265          } else {
   266              right = pos;
   267          }
   268      }
   269      if ((left == right) && cstrcasecmp(keywords[left].word, key, len) == 0) {
   270          return keywords[left].type;
   271      } else {
   272          return CHAR_NULL;
   273      }
   274  }
   275  
   276  static char is_keyword(const char* key, size_t len)
   277  {
   278      return bsearch_keyword_type(key, len, sql_keywords, sql_keywords_sz);
   279  }
   280  
   281  /* st_token methods
   282   *
   283   * The following functions manipulates the stoken_t type
   284   *
   285   *
   286   */
   287  
   288  static void st_clear(stoken_t * st)
   289  {
   290      memset(st, 0, sizeof(stoken_t));
   291  }
   292  
   293  static void st_assign_char(stoken_t * st, const char stype, size_t pos, size_t len,
   294                             const char value)
   295  {
   296      /* done to eliminate unused warning */
   297      (void)len;
   298      st->type = (char) stype;
   299      st->pos = pos;
   300      st->len = 1;
   301      st->val[0] = value;
   302      st->val[1] = CHAR_NULL;
   303  }
   304  
   305  static void st_assign(stoken_t * st, const char stype,
   306                        size_t pos, size_t len, const char* value)
   307  {
   308      const size_t MSIZE = LIBINJECTION_SQLI_TOKEN_SIZE;
   309      size_t last = len < MSIZE ? len : (MSIZE - 1);
   310      st->type = (char) stype;
   311      st->pos = pos;
   312      st->len = last;
   313      memcpy(st->val, value, last);
   314      st->val[last] = CHAR_NULL;
   315  }
   316  
   317  static void st_copy(stoken_t * dest, const stoken_t * src)
   318  {
   319      memcpy(dest, src, sizeof(stoken_t));
   320  }
   321  
   322  static int st_is_arithmetic_op(const stoken_t* st)
   323  {
   324      const char ch = st->val[0];
   325      return (st->type == TYPE_OPERATOR && st->len == 1 &&
   326              (ch == '*' || ch == '/' || ch == '-' || ch == '+' || ch == '%'));
   327  }
   328  
   329  static int st_is_unary_op(const stoken_t * st)
   330  {
   331      const char* str = st->val;
   332      const size_t len = st->len;
   333  
   334      if (st->type != TYPE_OPERATOR) {
   335          return FALSE;
   336      }
   337  
   338      switch (len) {
   339      case 1:
   340          return *str == '+' || *str == '-' || *str == '!' || *str == '~';
   341      case 2:
   342          return str[0] == '!' && str[1] == '!';
   343      case 3:
   344          return cstrcasecmp("NOT", str, 3) == 0;
   345      default:
   346          return FALSE;
   347      }
   348  }
   349  
   350  /* Parsers
   351   *
   352   *
   353   */
   354  
   355  static size_t parse_white(struct libinjection_sqli_state * sf)
   356  {
   357      return sf->pos + 1;
   358  }
   359  
   360  static size_t parse_operator1(struct libinjection_sqli_state * sf)
   361  {
   362      const char *cs = sf->s;
   363      size_t pos = sf->pos;
   364  
   365      st_assign_char(sf->current, TYPE_OPERATOR, pos, 1, cs[pos]);
   366      return pos + 1;
   367  }
   368  
   369  static size_t parse_other(struct libinjection_sqli_state * sf)
   370  {
   371      const char *cs = sf->s;
   372      size_t pos = sf->pos;
   373  
   374      st_assign_char(sf->current, TYPE_UNKNOWN, pos, 1, cs[pos]);
   375      return pos + 1;
   376  }
   377  
   378  static size_t parse_char(struct libinjection_sqli_state * sf)
   379  {
   380      const char *cs = sf->s;
   381      size_t pos = sf->pos;
   382  
   383      st_assign_char(sf->current, cs[pos], pos, 1, cs[pos]);
   384      return pos + 1;
   385  }
   386  
   387  static size_t parse_eol_comment(struct libinjection_sqli_state * sf)
   388  {
   389      const char *cs = sf->s;
   390      const size_t slen = sf->slen;
   391      size_t pos = sf->pos;
   392  
   393      const char *endpos =
   394          (const char *) memchr((const void *) (cs + pos), '\n', slen - pos);
   395      if (endpos == NULL) {
   396          st_assign(sf->current, TYPE_COMMENT, pos, slen - pos, cs + pos);
   397          return slen;
   398      } else {
   399          st_assign(sf->current, TYPE_COMMENT, pos, (size_t)(endpos - cs) - pos, cs + pos);
   400          return (size_t)((endpos - cs) + 1);
   401      }
   402  }
   403  
   404  /** In ANSI mode, hash is an operator
   405   *  In MYSQL mode, it's a EOL comment like '--'
   406   */
   407  static size_t parse_hash(struct libinjection_sqli_state * sf)
   408  {
   409      sf->stats_comment_hash += 1;
   410      if (sf->flags & FLAG_SQL_MYSQL) {
   411          sf->stats_comment_hash += 1;
   412          return parse_eol_comment(sf);
   413      } else {
   414          st_assign_char(sf->current, TYPE_OPERATOR, sf->pos, 1, '#');
   415          return sf->pos + 1;
   416      }
   417  }
   418  
   419  static size_t parse_dash(struct libinjection_sqli_state * sf)
   420  {
   421      const char *cs = sf->s;
   422      const size_t slen = sf->slen;
   423      size_t pos = sf->pos;
   424  
   425      /*
   426       * five cases
   427       * 1) --[white]  this is always a SQL comment
   428       * 2) --[EOF]    this is a comment
   429       * 3) --[notwhite] in MySQL this is NOT a comment but two unary operators
   430       * 4) --[notwhite] everyone else thinks this is a comment
   431       * 5) -[not dash]  '-' is a unary operator
   432       */
   433  
   434      if (pos + 2 < slen && cs[pos + 1] == '-' && char_is_white(cs[pos+2]) ) {
   435          return parse_eol_comment(sf);
   436      } else if (pos +2 == slen && cs[pos + 1] == '-') {
   437          return parse_eol_comment(sf);
   438      } else if (pos + 1 < slen && cs[pos + 1] == '-' && (sf->flags & FLAG_SQL_ANSI)) {
   439          /* --[not-white] not-white case:
   440           *
   441           */
   442          sf->stats_comment_ddx += 1;
   443          return parse_eol_comment(sf);
   444      } else {
   445          st_assign_char(sf->current, TYPE_OPERATOR, pos, 1, '-');
   446          return pos + 1;
   447      }
   448  }
   449  
   450  
   451  /** This detects MySQL comments, comments that
   452   * start with /x!   We just ban these now but
   453   * previously we attempted to parse the inside
   454   *
   455   * For reference:
   456   * the form of /x![anything]x/ or /x!12345[anything] x/
   457   *
   458   * Mysql 3 (maybe 4), allowed this:
   459   *    /x!0selectx/ 1;
   460   * where 0 could be any number.
   461   *
   462   * The last version of MySQL 3 was in 2003.
   463  
   464   * It is unclear if the MySQL 3 syntax was allowed
   465   * in MySQL 4.  The last version of MySQL 4 was in 2008
   466   *
   467   */
   468  static size_t is_mysql_comment(const char *cs, const size_t len, size_t pos)
   469  {
   470      /* so far...
   471       * cs[pos] == '/' && cs[pos+1] == '*'
   472       */
   473  
   474      if (pos + 2 >= len) {
   475          /* not a mysql comment */
   476          return 0;
   477      }
   478  
   479      if (cs[pos + 2] != '!') {
   480          /* not a mysql comment */
   481          return 0;
   482      }
   483  
   484      /*
   485       * this is a mysql comment
   486       *  got "/x!"
   487       */
   488      return 1;
   489  }
   490  
/* Handles '/'.
 *
 * If the next character is not '*', the slash is a one-character
 * operator. Otherwise a C-style comment starts here: the token spans up
 * to and including the closing star-slash, or to the end of input if the
 * comment is never closed.
 *
 * The token type is TYPE_COMMENT, upgraded to TYPE_EVIL when the comment
 * contains a nested comment opener or is a MySQL conditional comment.
 *
 * Returns the position just past the token.
 */
static size_t parse_slash(struct libinjection_sqli_state * sf)
{
    const char* ptr;
    size_t clen;
    const char *cs = sf->s;
    const size_t slen = sf->slen;
    size_t pos = sf->pos;
    const char* cur = cs + pos;
    char ctype = TYPE_COMMENT;
    size_t pos1 = pos + 1;
    if (pos1 == slen || cs[pos1] != '*') {
        return parse_operator1(sf);
    }

    /*
     * skip over initial '/x' and look for the closing 'x/'
     */
    ptr = memchr2(cur + 2, slen - (pos + 2), '*', '/');
    if (ptr == NULL) {
        /* unterminated: the comment runs to the end of input */
        clen = slen - pos;
    } else {
        /* length includes the two closing characters */
        clen = (size_t)(ptr + 2 - cur);
    }

    /*
     * postgresql allows nested comments, which is
     * incompatible with this style of parsing, so
     * if we find a '/x' inside the comment, then
     * mark the token as evil.
     *
     * Also, MySQL's "conditional" comments for version
     *  are an automatic black ban!
     */

    if (
        ptr != NULL &&
        /* searched range runs from just after the opener up to and
         * including the first character of the closer */
        memchr2(cur + 2, (size_t)(ptr - (cur + 1)), '/', '*') !=  NULL
    ) {
        ctype = TYPE_EVIL;
    } else if (is_mysql_comment(cs, slen, pos)) {
        ctype = TYPE_EVIL;
    }

    st_assign(sf->current, ctype, pos, clen, cs + pos);
    return pos + clen;
}
   538  
   539  
   540  static size_t parse_backslash(struct libinjection_sqli_state * sf)
   541  {
   542      const char *cs = sf->s;
   543      const size_t slen = sf->slen;
   544      size_t pos = sf->pos;
   545  
   546      /*
   547       * Weird MySQL alias for NULL, "\N" (capital N only)
   548       */
   549      if (pos + 1 < slen && cs[pos +1] == 'N') {
   550          st_assign(sf->current, TYPE_NUMBER, pos, 2, cs + pos);
   551          return pos + 2;
   552      } else {
   553          st_assign_char(sf->current, TYPE_BACKSLASH, pos, 1, cs[pos]);
   554          return pos + 1;
   555      }
   556  }
   557  
   558  static size_t parse_operator2(struct libinjection_sqli_state * sf)
   559  {
   560      char ch;
   561      const char *cs = sf->s;
   562      const size_t slen = sf->slen;
   563      size_t pos = sf->pos;
   564  
   565      if (pos + 1 >= slen) {
   566          return parse_operator1(sf);
   567      }
   568  
   569      if (pos + 2 < slen &&
   570          cs[pos] == '<' &&
   571          cs[pos + 1] == '=' &&
   572          cs[pos + 2] == '>') {
   573          /*
   574           * special 3-char operator
   575           */
   576          st_assign(sf->current, TYPE_OPERATOR, pos, 3, cs + pos);
   577          return pos + 3;
   578      }
   579  
   580      ch = sf->lookup(sf, LOOKUP_OPERATOR, cs + pos, 2);
   581      if (ch != CHAR_NULL) {
   582          st_assign(sf->current, ch, pos, 2, cs+pos);
   583          return pos + 2;
   584      }
   585  
   586      /*
   587       * not an operator.. what to do with the two
   588       * characters we got?
   589       */
   590  
   591      if (cs[pos] == ':') {
   592          /* ':' is not an operator */
   593          st_assign(sf->current, TYPE_COLON, pos, 1, cs+pos);
   594          return pos + 1;
   595      } else {
   596          /*
   597           * must be a single char operator
   598           */
   599          return parse_operator1(sf);
   600      }
   601  }
   602  
   603  /*
   604   * Ok!   "  \"   "  one backslash = escaped!
   605   *       " \\"   "  two backslash = not escaped!
   606   *       "\\\"   "  three backslash = escaped!
   607   */
   608  #ifndef __clang_analyzer__
   609  static int is_backslash_escaped(const char* end, const char* start)
   610  {
   611      const char* ptr;
   612  /* Code not to be analyzed by clang.
   613   *
   614   * Why we do this? Because there is a false positive here:
   615   * libinjection_sqli.c:608:13: warning: Out of bound memory access (access exceeds upper limit of memory block) [alpha.security.ArrayBoundV2]
   616   *       if (*ptr != '\\') {
   617   *           ^~~~
   618   * Specifically, this function deals with non-null terminated char arrays. This can be added
   619   * as prerequisite, and is not written clearly. But the math in the for below holds.
   620   */
   621      for (ptr = end; ptr >= start; ptr--) {
   622          if (*ptr != '\\') {
   623              break;
   624          }
   625      }
   626      /* if number of backslashes is odd, it is escaped */
   627      return (end - ptr) & 1;
   628  }
   629  #endif
   630  
   631  static size_t is_double_delim_escaped(const char* cur,  const char* end)
   632  {
   633      return  ((cur + 1) < end) && *(cur+1) == *cur;
   634  }
   635  
/* Scans a delimited string and stores it in 'st' as a TYPE_STRING token.
 *
 * cs / len: whole input buffer and its length
 * pos:      position of the start of the string construct
 * st:       token to fill in
 * delim:    the closing delimiter to search for
 * offset:   number of leading characters to skip before the string body
 *           (0 when the opening quote is only simulated)
 *
 * A delimiter does not end the string when it is backslash-escaped or when
 * it is immediately doubled:
 *
 * case 'foo''bar' --> foo''bar
 *
 * The real ending quote isn't duplicated (i.e. escaped), since what
 * follows it is a different character or the end of input.
 *
 * st->str_open is set to 'delim' when offset > 0 and CHAR_NULL otherwise;
 * st->str_close is set to 'delim' when a closing delimiter was found and
 * CHAR_NULL otherwise.
 *
 * Returns the position just past the closing delimiter, or 'len' when the
 * string is unterminated.
 */
static size_t parse_string_core(const char *cs, const size_t len, size_t pos,
                                stoken_t * st, char delim, size_t offset)
{
    /*
     * offset is to skip the perhaps first quote char
     */
    const char *qpos =
        (const char *) memchr((const void *) (cs + pos + offset), delim,
                              len - pos - offset);

    /*
     * then keep string open/close info
     */
    if (offset > 0) {
        /*
         * this is real quote
         */
        st->str_open = delim;
    } else {
        /*
         * this was a simulated quote
         */
        st->str_open = CHAR_NULL;
    }

    /* each pass examines one candidate closing delimiter at qpos */
    while (TRUE) {
        if (qpos == NULL) {
            /*
             * string ended with no trailing quote
             * assign what we have
             */
            st_assign(st, TYPE_STRING, pos + offset, len - pos - offset, cs + pos + offset);
            st->str_close = CHAR_NULL;
            return len;
        } else if ( is_backslash_escaped(qpos - 1, cs + pos + offset)) {
            /* escaped by a backslash: keep going, move ahead one character */
            qpos =
                (const char *) memchr((const void *) (qpos + 1), delim,
                                      (size_t)((cs + len) - (qpos + 1)));
            continue;
        } else if (is_double_delim_escaped(qpos, cs + len)) {
            /* doubled delimiter: keep going, move ahead two characters */
            qpos =
                (const char *) memchr((const void *) (qpos + 2), delim,
                                      (size_t)((cs + len) - (qpos + 2)));
            continue;
        } else {
            /* hey it's a normal string; the value excludes both quotes */
            st_assign(st, TYPE_STRING, pos + offset,
                      (size_t)(qpos - (cs + pos + offset)), cs + pos + offset);
            st->str_close = delim;
            return (size_t)(qpos - cs + 1);
        }
    }
}
   699  
   700  /**
   701   * Used when first char is a ' or "
   702   */
   703  static size_t parse_string(struct libinjection_sqli_state * sf)
   704  {
   705      const char *cs = sf->s;
   706      const size_t slen = sf->slen;
   707      size_t pos = sf->pos;
   708  
   709      /*
   710       * assert cs[pos] == single or double quote
   711       */
   712      return parse_string_core(cs, slen, pos, sf->current, cs[pos], 1);
   713  }
   714  
   715  /**
   716   * Used when first char is:
   717   *    N or n:  mysql "National Character set"
   718   *    E     :  psql  "Escaped String"
   719   */
   720  static size_t parse_estring(struct libinjection_sqli_state * sf)
   721  {
   722      const char *cs = sf->s;
   723      const size_t slen = sf->slen;
   724      size_t pos = sf->pos;
   725  
   726      if (pos + 2 >= slen || cs[pos+1] != CHAR_SINGLE) {
   727          return parse_word(sf);
   728      }
   729      return parse_string_core(cs, slen, pos, sf->current, CHAR_SINGLE, 2);
   730  }
   731  
   732  static size_t parse_ustring(struct libinjection_sqli_state * sf)
   733  {
   734      const char *cs = sf->s;
   735      size_t slen = sf->slen;
   736      size_t pos = sf->pos;
   737  
   738      if (pos + 2 < slen && cs[pos+1] == '&' && cs[pos+2] == '\'') {
   739          sf->pos += 2;
   740          pos = parse_string(sf);
   741          sf->current->str_open = 'u';
   742          if (sf->current->str_close == '\'') {
   743              sf->current->str_close = 'u';
   744          }
   745          return pos;
   746      } else {
   747          return parse_word(sf);
   748      }
   749  }
   750  
   751  static size_t parse_qstring_core(struct libinjection_sqli_state * sf, size_t offset)
   752  {
   753      char ch;
   754      const char *strend;
   755      const char *cs = sf->s;
   756      size_t slen = sf->slen;
   757      size_t pos = sf->pos + offset;
   758  
   759      /* if we are already at end of string..
   760         if current char is not q or Q
   761         if we don't have 2 more chars
   762         if char2 != a single quote
   763         then, just treat as word
   764      */
   765      if (pos >= slen ||
   766          (cs[pos] != 'q' && cs[pos] != 'Q') ||
   767          pos + 2 >= slen ||
   768          cs[pos + 1] != '\'') {
   769          return parse_word(sf);
   770      }
   771  
   772      ch = cs[pos + 2];
   773  
   774      /* the ch > 127 is un-needed since
   775       * we assume char is signed
   776       */
   777      if (ch < 33 /* || ch > 127 */) {
   778          return parse_word(sf);
   779      }
   780      switch (ch) {
   781      case '(' : ch = ')'; break;
   782      case '[' : ch = ']'; break;
   783      case '{' : ch = '}'; break;
   784      case '<' : ch = '>'; break;
   785      }
   786  
   787      strend = memchr2(cs + pos + 3, slen - pos - 3, ch, '\'');
   788      if (strend == NULL) {
   789          st_assign(sf->current, TYPE_STRING, pos + 3, slen - pos - 3, cs + pos + 3);
   790          sf->current->str_open = 'q';
   791          sf->current->str_close = CHAR_NULL;
   792          return slen;
   793      } else {
   794          st_assign(sf->current, TYPE_STRING, pos + 3, (size_t)(strend - cs) - pos -  3, cs + pos + 3);
   795          sf->current->str_open = 'q';
   796          sf->current->str_close = 'q';
   797          return (size_t)(strend - cs + 2);
   798      }
   799  }
   800  
   801  /*
   802   * Oracle's q string
   803   */
   804  static size_t parse_qstring(struct libinjection_sqli_state * sf)
   805  {
   806      return parse_qstring_core(sf, 0);
   807  }
   808  
   809  /*
   810   * mysql's N'STRING' or
   811   * ...  Oracle's nq string
   812   */
   813  static size_t parse_nqstring(struct libinjection_sqli_state * sf)
   814  {
   815      size_t slen = sf->slen;
   816      size_t pos = sf->pos;
   817      if (pos + 2 < slen && sf->s[pos+1] == CHAR_SINGLE) {
   818          return parse_estring(sf);
   819      }
   820      return parse_qstring_core(sf, 1);
   821  }
   822  
   823  /*
   824   * binary literal string
   825   * re: [bB]'[01]*'
   826   */
   827  static size_t parse_bstring(struct libinjection_sqli_state *sf)
   828  {
   829      size_t wlen;
   830      const char *cs = sf->s;
   831      size_t pos = sf->pos;
   832      size_t slen = sf->slen;
   833  
   834      /* need at least 2 more characters
   835       * if next char isn't a single quote, then
   836       * continue as normal word
   837       */
   838      if (pos + 2 >= slen || cs[pos+1] !=  '\'') {
   839          return parse_word(sf);
   840      }
   841  
   842      wlen = strlenspn(cs + pos + 2, sf->slen - pos - 2, "01");
   843      if (pos + 2 + wlen  >= slen || cs[pos + 2 + wlen] != '\'') {
   844          return parse_word(sf);
   845      }
   846      st_assign(sf->current, TYPE_NUMBER, pos, wlen + 3, cs + pos);
   847      return pos + 2 + wlen + 1;
   848  }
   849  
   850  /*
   851   * hex literal string
   852   * re: [xX]'[0123456789abcdefABCDEF]*'
   853   * mysql has requirement of having EVEN number of chars,
   854   *  but pgsql does not
   855   */
   856  static size_t parse_xstring(struct libinjection_sqli_state *sf)
   857  {
   858      size_t wlen;
   859      const char *cs = sf->s;
   860      size_t pos = sf->pos;
   861      size_t slen = sf->slen;
   862  
   863      /* need at least 2 more characters
   864       * if next char isn't a single quote, then
   865       * continue as normal word
   866       */
   867      if (pos + 2 >= slen || cs[pos+1] !=  '\'') {
   868          return parse_word(sf);
   869      }
   870  
   871      wlen = strlenspn(cs + pos + 2, sf->slen - pos - 2, "0123456789ABCDEFabcdef");
   872      if (pos + 2 + wlen  >= slen || cs[pos + 2 + wlen] != '\'') {
   873          return parse_word(sf);
   874      }
   875      st_assign(sf->current, TYPE_NUMBER, pos, wlen + 3, cs + pos);
   876      return pos + 2 + wlen + 1;
   877  }
   878  
   879  /**
   880   * This handles MS SQLSERVER bracket words
   881   * http://stackoverflow.com/questions/3551284/sql-serverwhat-do-brackets-mean-around-column-name
   882   *
   883   */
   884  static size_t parse_bword(struct libinjection_sqli_state * sf)
   885  {
   886      const char *cs = sf->s;
   887      size_t pos = sf->pos;
   888      const char* endptr = (const char*) memchr(cs + pos, ']', sf->slen - pos);
   889      if (endptr == NULL) {
   890          st_assign(sf->current, TYPE_BAREWORD, pos, sf->slen - pos, cs + pos);
   891          return sf->slen;
   892      } else {
   893          st_assign(sf->current, TYPE_BAREWORD, pos, (size_t)(endptr - cs) - pos + 1, cs + pos);
   894          return (size_t)((endptr - cs) + 1);
   895      }
   896  }
   897  
/* Parses a bare word starting at the current position.
 *
 * The word extends up to the first delimiter character (whitespace,
 * punctuation, quotes, NUL, ...). It is first stored as a TYPE_BAREWORD
 * token and then reclassified through sf->lookup(LOOKUP_WORD):
 *   - if a prefix ending just before a '.' or '`' is a known word of a
 *     type other than bareword, only that prefix becomes the token;
 *   - otherwise the whole word is looked up, provided it was short enough
 *     to be stored without truncation.
 *
 * Returns the position just past the characters consumed.
 */
static size_t parse_word(struct libinjection_sqli_state * sf)
{
    char ch;
    char delim;
    size_t i;
    const char *cs = sf->s;
    size_t pos = sf->pos;
    size_t wlen = strlencspn(cs + pos, sf->slen - pos,
                             " []{}<>:\\?=@!#~+-*/&|^%(),';\t\n\v\f\r\"\240\000");

    /* note: st_assign truncates the stored value to the token buffer size,
     * so sf->current->len may be smaller than wlen */
    st_assign(sf->current, TYPE_BAREWORD, pos, wlen, cs + pos);

    /* now we need to look inside what we got for "." and "`"
     * and see if what is before is a keyword or not
     */
    for (i =0; i < sf->current->len; ++i) {
        delim = sf->current->val[i];
        if (delim == '.' || delim == '`') {
            /* look up only the i characters preceding the delimiter */
            ch = sf->lookup(sf, LOOKUP_WORD, sf->current->val, i);
            if (ch != TYPE_NONE && ch != TYPE_BAREWORD) {
                /* needed for swig */
                st_clear(sf->current);
                /*
                 * we got something like "SELECT.1"
                 * or SELECT`column`
                 */
                st_assign(sf->current, ch, pos, i, cs + pos);
                return pos + i;
            }
        }
    }

    /*
     * do normal lookup with word including '.'
     * (skipped when the word was truncated; it then stays a bareword)
     */
    if (wlen < LIBINJECTION_SQLI_TOKEN_SIZE) {

        ch = sf->lookup(sf, LOOKUP_WORD, sf->current->val, wlen);
        if (ch == CHAR_NULL) {
            ch = TYPE_BAREWORD;
        }
        sf->current->type = ch;
    }
    return pos + wlen;
}
   943  
   944  /* MySQL backticks are a cross between string and
   945   * and a bare word.
   946   *
   947   */
   948  static size_t parse_tick(struct libinjection_sqli_state* sf)
   949  {
   950      size_t pos =  parse_string_core(sf->s, sf->slen, sf->pos, sf->current, CHAR_TICK, 1);
   951  
   952      /* we could check to see if start and end of
   953       * of string are both "`", i.e. make sure we have
   954       * matching set.  `foo` vs. `foo
   955       * but I don't think it matters much
   956       */
   957  
   958      /* check value of string to see if it's a keyword,
   959       * function, operator, etc
   960       */
   961      char ch = sf->lookup(sf, LOOKUP_WORD, sf->current->val, sf->current->len);
   962      if (ch == TYPE_FUNCTION) {
   963          /* if it's a function, then convert token */
   964          sf->current->type = TYPE_FUNCTION;
   965      } else {
   966          /* otherwise it's a 'n' type -- mysql treats
   967           * everything as a bare word
   968           */
   969          sf->current->type = TYPE_BAREWORD;
   970      }
   971      return pos;
   972  }
   973  
   974  static size_t parse_var(struct libinjection_sqli_state * sf)
   975  {
   976      size_t xlen;
   977      const char *cs = sf->s;
   978      const size_t slen = sf->slen;
   979      size_t pos = sf->pos + 1;
   980  
   981      /*
   982       * var_count is only used to reconstruct
   983       * the input.  It counts the number of '@'
   984       * seen 0 in the case of NULL, 1 or 2
   985       */
   986  
   987      /*
   988       * move past optional other '@'
   989       */
   990      if (pos < slen && cs[pos] == '@') {
   991          pos += 1;
   992          sf->current->count = 2;
   993      } else {
   994          sf->current->count = 1;
   995      }
   996  
   997      /*
   998       * MySQL allows @@`version`
   999       */
  1000      if (pos < slen) {
  1001          if (cs[pos] == '`') {
  1002              sf->pos = pos;
  1003              pos = parse_tick(sf);
  1004              sf->current->type = TYPE_VARIABLE;
  1005              return pos;
  1006          } else if (cs[pos] == CHAR_SINGLE || cs[pos] == CHAR_DOUBLE) {
  1007              sf->pos = pos;
  1008              pos = parse_string(sf);
  1009              sf->current->type = TYPE_VARIABLE;
  1010              return pos;
  1011          }
  1012      }
  1013  
  1014  
  1015      xlen = strlencspn(cs + pos, slen - pos,
  1016                       " <>:\\?=@!#~+-*/&|^%(),';\t\n\v\f\r'`\"");
  1017      if (xlen == 0) {
  1018          st_assign(sf->current, TYPE_VARIABLE, pos, 0, cs + pos);
  1019          return pos;
  1020      } else {
  1021          st_assign(sf->current, TYPE_VARIABLE, pos, xlen, cs + pos);
  1022          return pos + xlen;
  1023      }
  1024  }
  1025  
  1026  static size_t parse_money(struct libinjection_sqli_state *sf)
  1027  {
  1028      size_t xlen;
  1029      const char* strend;
  1030      const char *cs = sf->s;
  1031      const size_t slen = sf->slen;
  1032      size_t pos = sf->pos;
  1033  
  1034      if (pos + 1 == slen) {
  1035          /* end of line */
  1036          st_assign_char(sf->current, TYPE_BAREWORD, pos, 1, '$');
  1037          return slen;
  1038      }
  1039  
  1040      /*
  1041       * $1,000.00 or $1.000,00 ok!
  1042       * This also parses $....,,,111 but that's ok
  1043       */
  1044  
  1045      xlen = strlenspn(cs + pos + 1, slen - pos - 1, "0123456789.,");
  1046      if (xlen == 0) {
  1047          if (cs[pos + 1] == '$') {
  1048              /* we have $$ .. find ending $$ and make string */
  1049              strend = memchr2(cs + pos + 2, slen - pos -2, '$', '$');
  1050              if (strend == NULL) {
  1051                  /* fell off edge */
  1052                  st_assign(sf->current, TYPE_STRING, pos + 2, slen - (pos + 2), cs + pos + 2);
  1053                  sf->current->str_open = '$';
  1054                  sf->current->str_close = CHAR_NULL;
  1055                  return slen;
  1056              } else {
  1057                  st_assign(sf->current, TYPE_STRING, pos + 2,
  1058                            (size_t)(strend - (cs + pos + 2)), cs + pos + 2);
  1059                  sf->current->str_open = '$';
  1060                  sf->current->str_close = '$';
  1061                  return (size_t)(strend - cs + 2);
  1062              }
  1063          } else {
  1064              /* ok it's not a number or '$$', but maybe it's pgsql "$ quoted strings" */
  1065              xlen = strlenspn(cs + pos + 1, slen - pos - 1, "abcdefghjiklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
  1066              if (xlen == 0) {
  1067                  /* hmm it's "$" _something_ .. just add $ and keep going*/
  1068                  st_assign_char(sf->current, TYPE_BAREWORD, pos, 1, '$');
  1069                  return pos + 1;
  1070              }
  1071              /* we have $foobar????? */
  1072              /* is it $foobar$ */
  1073              if (pos + xlen + 1 == slen || cs[pos+xlen+1] != '$') {
  1074                  /* not $foobar$, or fell off edge */
  1075                  st_assign_char(sf->current, TYPE_BAREWORD, pos, 1, '$');
  1076                  return pos + 1;
  1077              }
  1078  
  1079              /* we have $foobar$ ... find it again */
  1080              strend = my_memmem(cs+pos+xlen+2, slen - (pos+xlen+2), cs + pos, xlen+2);
  1081  
  1082              if (strend == NULL) {
  1083                  /* fell off edge */
  1084                  st_assign(sf->current, TYPE_STRING, pos+xlen+2, slen - pos - xlen - 2, cs+pos+xlen+2);
  1085                  sf->current->str_open = '$';
  1086                  sf->current->str_close = CHAR_NULL;
  1087                  return slen;
  1088              } else {
  1089                  /* got one */
  1090                  st_assign(sf->current, TYPE_STRING, pos+xlen+2,
  1091                            (size_t)(strend - (cs + pos + xlen + 2)), cs+pos+xlen+2);
  1092                  sf->current->str_open = '$';
  1093                  sf->current->str_close = '$';
  1094                  return (size_t)((strend + xlen + 2) - cs);
  1095              }
  1096          }
  1097      } else if (xlen == 1 && cs[pos + 1] == '.') {
  1098          /* $. should parsed as a word */
  1099          return parse_word(sf);
  1100      } else {
  1101          st_assign(sf->current, TYPE_NUMBER, pos, 1 + xlen, cs + pos);
  1102          return pos + 1 + xlen;
  1103      }
  1104  }
  1105  
  1106  static size_t parse_number(struct libinjection_sqli_state * sf)
  1107  {
  1108      size_t xlen;
  1109      size_t start;
  1110      const char* digits = NULL;
  1111      const char *cs = sf->s;
  1112      const size_t slen = sf->slen;
  1113      size_t pos = sf->pos;
  1114      int have_e = 0;
  1115      int have_exp = 0;
  1116  
  1117      /* cs[pos] == '0' has 1/10 chance of being true,
  1118       * while pos+1< slen is almost always true
  1119       */
  1120      if (cs[pos] == '0' && pos + 1 < slen) {
  1121          if (cs[pos + 1] == 'X' || cs[pos + 1] == 'x') {
  1122              digits = "0123456789ABCDEFabcdef";
  1123          } else if (cs[pos + 1] == 'B' || cs[pos + 1] == 'b') {
  1124              digits = "01";
  1125          }
  1126  
  1127          if (digits) {
  1128              xlen = strlenspn(cs + pos + 2, slen - pos - 2, digits);
  1129              if (xlen == 0) {
  1130                  st_assign(sf->current, TYPE_BAREWORD, pos, 2, cs + pos);
  1131                  return pos + 2;
  1132              } else {
  1133                  st_assign(sf->current, TYPE_NUMBER, pos, 2 + xlen, cs + pos);
  1134                  return pos + 2 + xlen;
  1135              }
  1136          }
  1137      }
  1138  
  1139      start = pos;
  1140      while (pos < slen && ISDIGIT(cs[pos])) {
  1141          pos += 1;
  1142      }
  1143  
  1144      if (pos < slen && cs[pos] == '.') {
  1145          pos += 1;
  1146          while (pos < slen && ISDIGIT(cs[pos])) {
  1147              pos += 1;
  1148          }
  1149          if (pos - start == 1) {
  1150              /* only one character read so far */
  1151              st_assign_char(sf->current, TYPE_DOT, start, 1, '.');
  1152              return pos;
  1153          }
  1154      }
  1155  
  1156      if (pos < slen) {
  1157          if (cs[pos] == 'E' || cs[pos] == 'e') {
  1158              have_e = 1;
  1159              pos += 1;
  1160              if (pos < slen && (cs[pos] == '+' || cs[pos] == '-')) {
  1161                  pos += 1;
  1162              }
  1163              while (pos < slen && ISDIGIT(cs[pos])) {
  1164                  have_exp = 1;
  1165                  pos += 1;
  1166              }
  1167          }
  1168      }
  1169  
  1170      /* oracle's ending float or double suffix
  1171       * http://docs.oracle.com/cd/B19306_01/server.102/b14200/sql_elements003.htm#i139891
  1172       */
  1173      if (pos < slen && (cs[pos] == 'd' || cs[pos] == 'D' || cs[pos] == 'f' || cs[pos] == 'F')) {
  1174          if (pos + 1 == slen) {
  1175              /* line ends evaluate "... 1.2f$" as '1.2f' */
  1176              pos += 1;
  1177          } else if ((char_is_white(cs[pos+1]) || cs[pos+1] == ';')) {
  1178              /*
  1179               * easy case, evaluate "... 1.2f ... as '1.2f'
  1180               */
  1181              pos += 1;
  1182          } else if (cs[pos+1] == 'u' || cs[pos+1] == 'U') {
  1183              /*
  1184               * a bit of a hack but makes '1fUNION' parse as '1f UNION'
  1185               */
  1186              pos += 1;
  1187          } else {
  1188              /* it's like "123FROM" */
  1189              /* parse as "123" only */
  1190          }
  1191      }
  1192  
  1193      if (have_e == 1 && have_exp == 0) {
  1194          /* very special form of
  1195           * "1234.e"
  1196           * "10.10E"
  1197           * ".E"
  1198           * this is a WORD not a number!! */
  1199          st_assign(sf->current, TYPE_BAREWORD, start, pos - start, cs + start);
  1200      } else {
  1201          st_assign(sf->current, TYPE_NUMBER, start, pos - start, cs + start);
  1202      }
  1203      return pos;
  1204  }
  1205  
  1206  /*
  1207   * API to return version.  This allows us to increment the version
  1208   * without having to regenerated the SWIG (or other binding) in minor
  1209   * releases.
  1210   */
  1211  const char* libinjection_version(void)
  1212  {
  1213      return LIBINJECTION_VERSION;
  1214  }
  1215  
  1216  int libinjection_sqli_tokenize(struct libinjection_sqli_state *sf)
  1217  {
  1218      pt2Function fnptr;
  1219      size_t *pos = &sf->pos;
  1220      stoken_t *current = sf->current;
  1221      const char *s = sf->s;
  1222      const size_t slen = sf->slen;
  1223  
  1224      if (slen == 0) {
  1225          return FALSE;
  1226      }
  1227  
  1228      st_clear(current);
  1229      sf->current = current;
  1230  
  1231      /*
  1232       * if we are at beginning of string
  1233       *  and in single-quote or double quote mode
  1234       *  then pretend the input starts with a quote
  1235       */
  1236      if (*pos == 0 && (sf->flags & (FLAG_QUOTE_SINGLE | FLAG_QUOTE_DOUBLE))) {
  1237          *pos = parse_string_core(s, slen, 0, current, flag2delim(sf->flags), 0);
  1238          sf->stats_tokens += 1;
  1239          return TRUE;
  1240      }
  1241  
  1242      while (*pos < slen) {
  1243  
  1244          /*
  1245           * get current character
  1246           */
  1247          const unsigned char ch = (unsigned char) (s[*pos]);
  1248  
  1249          /*
  1250           * look up the parser, and call it
  1251           *
  1252           * Porting Note: this is mapping of char to function
  1253           *   charparsers[ch]()
  1254           */
  1255          fnptr = char_parse_map[ch];
  1256  
  1257          *pos = (*fnptr) (sf);
  1258  
  1259          /*
  1260           *
  1261           */
  1262          if (current->type != CHAR_NULL) {
  1263              sf->stats_tokens += 1;
  1264              return TRUE;
  1265          }
  1266      }
  1267      return FALSE;
  1268  }
  1269  
  1270  void libinjection_sqli_init(struct libinjection_sqli_state * sf, const char *s, size_t len, int flags)
  1271  {
  1272      if (flags == 0) {
  1273          flags = FLAG_QUOTE_NONE | FLAG_SQL_ANSI;
  1274      }
  1275  
  1276      memset(sf, 0, sizeof(struct libinjection_sqli_state));
  1277      sf->s        = s;
  1278      sf->slen     = len;
  1279      sf->lookup   = libinjection_sqli_lookup_word;
  1280      sf->userdata = 0;
  1281      sf->flags    = flags;
  1282      sf->current  = &(sf->tokenvec[0]);
  1283  }
  1284  
  1285  void libinjection_sqli_reset(struct libinjection_sqli_state * sf, int flags)
  1286  {
  1287      void *userdata = sf->userdata;
  1288      ptr_lookup_fn lookup = sf->lookup;
  1289  
  1290      if (flags == 0) {
  1291          flags = FLAG_QUOTE_NONE | FLAG_SQL_ANSI;
  1292      }
  1293      libinjection_sqli_init(sf, sf->s, sf->slen, flags);
  1294      sf->lookup = lookup;
  1295      sf->userdata = userdata;
  1296  }
  1297  
  1298  void libinjection_sqli_callback(struct libinjection_sqli_state * sf, ptr_lookup_fn fn, void* userdata)
  1299  {
  1300      if (fn == NULL) {
  1301          sf->lookup = libinjection_sqli_lookup_word;
  1302          sf->userdata = (void*)(NULL);
  1303      } else {
  1304          sf->lookup = fn;
  1305          sf->userdata = userdata;
  1306      }
  1307  }
  1308  
  1309  /** See if two tokens can be merged since they are compound SQL phrases.
  1310   *
  1311   * This takes two tokens, and, if they are the right type,
  1312   * merges their values together.  Then checks to see if the
  1313   * new value is special using the PHRASES mapping.
  1314   *
  1315   * Example: "UNION" + "ALL" ==> "UNION ALL"
  1316   *
  1317   * C Security Notes: this is safe to use C-strings (null-terminated)
  1318   *  since the types involved by definition do not have embedded nulls
  1319   *  (e.g. there is no keyword with embedded null)
  1320   *
  1321   * Porting Notes: since this is C, it's oddly complicated.
  1322   *  This is just:  multikeywords[token.value + ' ' + token2.value]
  1323   *
  1324   */
  1325  static int syntax_merge_words(struct libinjection_sqli_state * sf,stoken_t * a, stoken_t * b)
  1326  {
  1327      size_t sz1;
  1328      size_t sz2;
  1329      size_t sz3;
  1330      char tmp[LIBINJECTION_SQLI_TOKEN_SIZE];
  1331      char ch;
  1332  
  1333      /* first token is of right type? */
  1334      if (!
  1335          (a->type == TYPE_KEYWORD ||
  1336           a->type == TYPE_BAREWORD ||
  1337           a->type == TYPE_OPERATOR ||
  1338           a->type == TYPE_UNION ||
  1339           a->type == TYPE_FUNCTION ||
  1340           a->type == TYPE_EXPRESSION ||
  1341           a->type == TYPE_TSQL ||
  1342           a->type == TYPE_SQLTYPE)) {
  1343          return FALSE;
  1344      }
  1345  
  1346      if (!
  1347          (b->type == TYPE_KEYWORD ||
  1348           b->type == TYPE_BAREWORD ||
  1349           b->type == TYPE_OPERATOR ||
  1350           b->type == TYPE_UNION ||
  1351           b->type == TYPE_FUNCTION ||
  1352           b->type == TYPE_EXPRESSION ||
  1353           b->type == TYPE_TSQL ||
  1354           b->type == TYPE_SQLTYPE ||
  1355           b->type == TYPE_LOGIC_OPERATOR)) {
  1356          return FALSE;
  1357      }
  1358  
  1359      sz1 = a->len;
  1360      sz2 = b->len;
  1361      sz3 = sz1 + sz2 + 1; /* +1 for space in the middle */
  1362      if (sz3 >= LIBINJECTION_SQLI_TOKEN_SIZE) { /* make sure there is room for ending null */
  1363          return FALSE;
  1364      }
  1365      /*
  1366       * oddly annoying  last.val + ' ' + current.val
  1367       */
  1368      memcpy(tmp, a->val, sz1);
  1369      tmp[sz1] = ' ';
  1370      memcpy(tmp + sz1 + 1, b->val, sz2);
  1371      tmp[sz3] = CHAR_NULL;
  1372      ch = sf->lookup(sf, LOOKUP_WORD, tmp, sz3);
  1373  
  1374      if (ch != CHAR_NULL) {
  1375          st_assign(a, ch, a->pos, sz3, tmp);
  1376          return TRUE;
  1377      } else {
  1378          return FALSE;
  1379      }
  1380  }
  1381  
  1382  int libinjection_sqli_fold(struct libinjection_sqli_state * sf)
  1383  {
  1384      stoken_t last_comment;
  1385  
  1386      /* POS is the position of where the NEXT token goes */
  1387      size_t pos = 0;
  1388  
  1389      /* LEFT is a count of how many tokens that are already
  1390         folded or processed (i.e. part of the fingerprint) */
  1391      size_t left =  0;
  1392  
  1393      int more = 1;
  1394  
  1395      st_clear(&last_comment);
  1396  
  1397      /* Skip all initial comments, right-parens ( and unary operators
  1398       *
  1399       */
  1400      sf->current = &(sf->tokenvec[0]);
  1401      while (more) {
  1402          more = libinjection_sqli_tokenize(sf);
  1403          if ( ! (sf->current->type == TYPE_COMMENT ||
  1404                  sf->current->type == TYPE_LEFTPARENS ||
  1405                  sf->current->type == TYPE_SQLTYPE ||
  1406                  st_is_unary_op(sf->current))) {
  1407              break;
  1408          }
  1409      }
  1410  
  1411      if (! more) {
  1412          /* If input was only comments, unary or (, then exit */
  1413          return 0;
  1414      } else {
  1415          /* it's some other token */
  1416          pos += 1;
  1417      }
  1418  
  1419      while (1) {
  1420          FOLD_DEBUG;
  1421  
  1422          /* do we have all the max number of tokens?  if so do
  1423           * some special cases for 5 tokens
  1424           */
  1425          if (pos >= LIBINJECTION_SQLI_MAX_TOKENS) {
  1426              if (
  1427                  (
  1428                      sf->tokenvec[0].type == TYPE_NUMBER &&
  1429                      (sf->tokenvec[1].type == TYPE_OPERATOR || sf->tokenvec[1].type == TYPE_COMMA) &&
  1430                      sf->tokenvec[2].type == TYPE_LEFTPARENS &&
  1431                      sf->tokenvec[3].type == TYPE_NUMBER &&
  1432                      sf->tokenvec[4].type == TYPE_RIGHTPARENS
  1433                      ) ||
  1434                  (
  1435                      sf->tokenvec[0].type == TYPE_BAREWORD &&
  1436                      sf->tokenvec[1].type == TYPE_OPERATOR &&
  1437                      sf->tokenvec[2].type == TYPE_LEFTPARENS &&
  1438                      (sf->tokenvec[3].type == TYPE_BAREWORD || sf->tokenvec[3].type == TYPE_NUMBER) &&
  1439                      sf->tokenvec[4].type == TYPE_RIGHTPARENS
  1440                      ) ||
  1441                  (
  1442                      sf->tokenvec[0].type == TYPE_NUMBER &&
  1443                      sf->tokenvec[1].type == TYPE_RIGHTPARENS &&
  1444                      sf->tokenvec[2].type == TYPE_COMMA &&
  1445                      sf->tokenvec[3].type == TYPE_LEFTPARENS &&
  1446                      sf->tokenvec[4].type == TYPE_NUMBER
  1447                      ) ||
  1448                  (
  1449                      sf->tokenvec[0].type == TYPE_BAREWORD &&
  1450                      sf->tokenvec[1].type == TYPE_RIGHTPARENS &&
  1451                      sf->tokenvec[2].type == TYPE_OPERATOR &&
  1452                      sf->tokenvec[3].type == TYPE_LEFTPARENS &&
  1453                      sf->tokenvec[4].type == TYPE_BAREWORD
  1454                      )
  1455                  )
  1456              {
  1457                  if (pos > LIBINJECTION_SQLI_MAX_TOKENS) {
  1458  		    st_copy(&(sf->tokenvec[1]), &(sf->tokenvec[LIBINJECTION_SQLI_MAX_TOKENS]));
  1459                      pos = 2;
  1460                      left = 0;
  1461                  } else {
  1462                      pos = 1;
  1463                      left = 0;
  1464                  }
  1465              }
  1466          }
  1467  
  1468          if (! more || left >= LIBINJECTION_SQLI_MAX_TOKENS) {
  1469              left = pos;
  1470              break;
  1471          }
  1472  
  1473          /* get up to two tokens */
  1474          while (more && pos <= LIBINJECTION_SQLI_MAX_TOKENS && (pos - left) < 2) {
  1475              sf->current = &(sf->tokenvec[pos]);
  1476              more = libinjection_sqli_tokenize(sf);
  1477              if (more) {
  1478                  if (sf->current->type == TYPE_COMMENT) {
  1479                      st_copy(&last_comment, sf->current);
  1480                  } else {
  1481                      last_comment.type = CHAR_NULL;
  1482                      pos += 1;
  1483                  }
  1484              }
  1485          }
  1486          FOLD_DEBUG;
  1487          /* did we get 2 tokens? if not then we are done */
  1488          if (pos - left < 2) {
  1489              left = pos;
  1490              continue;
  1491          }
  1492  
  1493          /* FOLD: "ss" -> "s"
  1494           * "foo" "bar" is valid SQL
  1495           * just ignore second string
  1496           */
  1497          if (sf->tokenvec[left].type == TYPE_STRING && sf->tokenvec[left+1].type == TYPE_STRING) {
  1498              pos -= 1;
  1499              sf->stats_folds += 1;
  1500              continue;
  1501          } else if (sf->tokenvec[left].type == TYPE_SEMICOLON && sf->tokenvec[left+1].type == TYPE_SEMICOLON) {
  1502              /* not sure how various engines handle
  1503               * 'select 1;;drop table foo' or
  1504               * 'select 1; /x foo x/; drop table foo'
  1505               * to prevent surprises, just fold away repeated semicolons
  1506               */
  1507              pos -= 1;
  1508              sf->stats_folds += 1;
  1509              continue;
  1510          } else if ((sf->tokenvec[left].type == TYPE_OPERATOR ||
  1511                      sf->tokenvec[left].type == TYPE_LOGIC_OPERATOR) &&
  1512                     (st_is_unary_op(&sf->tokenvec[left+1]) ||
  1513                      sf->tokenvec[left+1].type == TYPE_SQLTYPE)) {
  1514              pos -= 1;
  1515              sf->stats_folds += 1;
  1516              left = 0;
  1517              continue;
  1518          } else if (sf->tokenvec[left].type == TYPE_LEFTPARENS &&
  1519                     st_is_unary_op(&sf->tokenvec[left+1])) {
  1520              pos -= 1;
  1521              sf->stats_folds += 1;
  1522              if (left > 0) {
  1523                  left -= 1;
  1524              }
  1525              continue;
  1526          } else if (syntax_merge_words(sf, &sf->tokenvec[left], &sf->tokenvec[left+1])) {
  1527              pos -= 1;
  1528              sf->stats_folds += 1;
  1529              if (left > 0) {
  1530                  left -= 1;
  1531              }
  1532              continue;
  1533          } else if (sf->tokenvec[left].type == TYPE_SEMICOLON &&
  1534                     sf->tokenvec[left+1].type == TYPE_FUNCTION &&
  1535  		   (sf->tokenvec[left+1].val[0] == 'I' ||
  1536  		    sf->tokenvec[left+1].val[0] == 'i' ) &&
  1537  		   (sf->tokenvec[left+1].val[1] == 'F' ||
  1538                      sf->tokenvec[left+1].val[1] == 'f' )) {
  1539              /* IF is normally a function, except in Transact-SQL where it can be used as a
  1540               * standalone control flow operator, e.g. ; IF 1=1 ...
  1541               * if found after a semicolon, convert from 'f' type to 'T' type
  1542               */
  1543              sf->tokenvec[left+1].type = TYPE_TSQL;
  1544              /* left += 2; */
  1545              continue; /* reparse everything, but we probably can advance left, and pos */
  1546          } else if ((sf->tokenvec[left].type == TYPE_BAREWORD || sf->tokenvec[left].type == TYPE_VARIABLE) &&
  1547                     sf->tokenvec[left+1].type == TYPE_LEFTPARENS && (
  1548                         /* TSQL functions but common enough to be column names */
  1549                         cstrcasecmp("USER_ID", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1550                         cstrcasecmp("USER_NAME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1551  
  1552                         /* Function in MYSQL */
  1553                         cstrcasecmp("DATABASE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1554                         cstrcasecmp("PASSWORD", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1555                         cstrcasecmp("USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1556  
  1557                         /* Mysql words that act as a variable and are a function */
  1558  
  1559                         /* TSQL current_users is fake-variable */
  1560                         /* http://msdn.microsoft.com/en-us/library/ms176050.aspx */
  1561                         cstrcasecmp("CURRENT_USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1562                         cstrcasecmp("CURRENT_DATE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1563                         cstrcasecmp("CURRENT_TIME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1564                         cstrcasecmp("CURRENT_TIMESTAMP", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1565                         cstrcasecmp("LOCALTIME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1566                         cstrcasecmp("LOCALTIMESTAMP", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0
  1567                         )) {
  1568  
  1569              /* pos is the same
  1570               * other conversions need to go here... for instance
  1571               * password CAN be a function, coalesce CAN be a function
  1572               */
  1573              sf->tokenvec[left].type = TYPE_FUNCTION;
  1574              continue;
  1575          } else if (sf->tokenvec[left].type == TYPE_KEYWORD && (
  1576                         cstrcasecmp("IN", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1577                         cstrcasecmp("NOT IN", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0
  1578                         )) {
  1579  
  1580              if (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
  1581                  /* got .... IN ( ...  (or 'NOT IN')
  1582                   * it's an operator
  1583                   */
  1584                  sf->tokenvec[left].type = TYPE_OPERATOR;
  1585              } else {
  1586                  /*
  1587                   * it's a nothing
  1588                   */
  1589                  sf->tokenvec[left].type = TYPE_BAREWORD;
  1590              }
  1591  
  1592              /* "IN" can be used as "IN BOOLEAN MODE" for mysql
  1593               *  in which case merging of words can be done later
  1594               * other wise it acts as an equality operator __ IN (values..)
  1595               *
  1596               * here we got "IN" "(" so it's an operator.
  1597               * also back track to handle "NOT IN"
  1598               * might need to do the same with like
  1599               * two use cases   "foo" LIKE "BAR" (normal operator)
  1600               *  "foo" = LIKE(1,2)
  1601               */
  1602              continue;
  1603          } else if ((sf->tokenvec[left].type == TYPE_OPERATOR) && (
  1604                         cstrcasecmp("LIKE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
  1605                         cstrcasecmp("NOT LIKE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0)) {
  1606              if (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
  1607                  /* SELECT LIKE(...
  1608                   * it's a function
  1609                   */
  1610                  sf->tokenvec[left].type = TYPE_FUNCTION;
  1611              }
  1612          } else if (sf->tokenvec[left].type == TYPE_SQLTYPE &&
  1613                     (sf->tokenvec[left+1].type == TYPE_BAREWORD ||
  1614                      sf->tokenvec[left+1].type == TYPE_NUMBER ||
  1615                      sf->tokenvec[left+1].type == TYPE_SQLTYPE ||
  1616                      sf->tokenvec[left+1].type == TYPE_LEFTPARENS ||
  1617                      sf->tokenvec[left+1].type == TYPE_FUNCTION ||
  1618                      sf->tokenvec[left+1].type == TYPE_VARIABLE ||
  1619                      sf->tokenvec[left+1].type == TYPE_STRING))  {
  1620              st_copy(&sf->tokenvec[left], &sf->tokenvec[left+1]);
  1621              pos -= 1;
  1622              sf->stats_folds += 1;
  1623              left = 0;
  1624              continue;
  1625          } else if (sf->tokenvec[left].type == TYPE_COLLATE &&
  1626                     sf->tokenvec[left+1].type == TYPE_BAREWORD) {
  1627              /*
  1628               * there are too many collation types.. so if the bareword has a "_"
  1629               * then it's TYPE_SQLTYPE
  1630               */
  1631              if (strchr(sf->tokenvec[left+1].val, '_') != NULL) {
  1632                  sf->tokenvec[left+1].type = TYPE_SQLTYPE;
  1633                  left = 0;
  1634              }
  1635          } else if (sf->tokenvec[left].type == TYPE_BACKSLASH) {
  1636              if (st_is_arithmetic_op(&(sf->tokenvec[left+1]))) {
  1637                  /* very weird case in TSQL where '\%1' is parsed as '0 % 1', etc */
  1638                  sf->tokenvec[left].type = TYPE_NUMBER;
  1639              } else {
  1640                  /* just ignore it.. Again T-SQL seems to parse \1 as "1" */
  1641                  st_copy(&sf->tokenvec[left], &sf->tokenvec[left+1]);
  1642                  pos -= 1;
  1643                  sf->stats_folds += 1;
  1644              }
  1645              left = 0;
  1646              continue;
  1647          } else if (sf->tokenvec[left].type == TYPE_LEFTPARENS &&
  1648                     sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
  1649              pos -= 1;
  1650              left = 0;
  1651              sf->stats_folds += 1;
  1652              continue;
  1653          } else if (sf->tokenvec[left].type == TYPE_RIGHTPARENS &&
  1654                     sf->tokenvec[left+1].type == TYPE_RIGHTPARENS) {
  1655              pos -= 1;
  1656              left = 0;
  1657              sf->stats_folds += 1;
  1658              continue;
  1659          } else if (sf->tokenvec[left].type == TYPE_LEFTBRACE &&
  1660                     sf->tokenvec[left+1].type == TYPE_BAREWORD) {
  1661  
  1662              /*
  1663               * MySQL Degenerate case --
  1664               *
  1665               *   select { ``.``.id };  -- valid !!!
  1666               *   select { ``.``.``.id };  -- invalid
  1667               *   select ``.``.id; -- invalid
  1668               *   select { ``.id }; -- invalid
  1669               *
  1670               * so it appears {``.``.id} is a magic case
  1671               * I suspect this is "current database, current table, field id"
  1672               *
  1673               * The folding code can't look at more than 3 tokens, and
  1674               * I don't want to make two passes.
  1675               *
  1676               * Since "{ ``" so rare, we are just going to blacklist it.
  1677               *
  1678               * Highly likely this will need revisiting!
  1679               *
  1680               * CREDIT @rsalgado 2013-11-25
  1681               */
  1682              if (sf->tokenvec[left+1].len == 0) {
  1683                  sf->tokenvec[left+1].type = TYPE_EVIL;
  1684                  return (int)(left+2);
  1685              }
  1686              /* weird ODBC / MYSQL  {foo expr} --> expr
  1687               * but for this rule we just strip away the "{ foo" part
  1688               */
  1689              left = 0;
  1690              pos -= 2;
  1691              sf->stats_folds += 2;
  1692              continue;
  1693          } else if (sf->tokenvec[left+1].type == TYPE_RIGHTBRACE) {
  1694              pos -= 1;
  1695              left = 0;
  1696              sf->stats_folds += 1;
  1697              continue;
  1698          }
  1699  
  1700          /* all cases of handing 2 tokens is done
  1701             and nothing matched.  Get one more token
  1702          */
  1703          FOLD_DEBUG;
  1704          while (more && pos <= LIBINJECTION_SQLI_MAX_TOKENS && pos - left < 3) {
  1705              sf->current = &(sf->tokenvec[pos]);
  1706              more = libinjection_sqli_tokenize(sf);
  1707              if (more) {
  1708                  if (sf->current->type == TYPE_COMMENT) {
  1709                      st_copy(&last_comment, sf->current);
  1710                  } else {
  1711                      last_comment.type = CHAR_NULL;
  1712                      pos += 1;
  1713                  }
  1714              }
  1715          }
  1716  
  1717          /* do we have three tokens? If not then we are done */
  1718          if (pos -left < 3) {
  1719              left = pos;
  1720              continue;
  1721          }
  1722  
  1723          /*
  1724           * now look for three token folding
  1725           */
  1726          if (sf->tokenvec[left].type == TYPE_NUMBER &&
  1727              sf->tokenvec[left+1].type == TYPE_OPERATOR &&
  1728              sf->tokenvec[left+2].type == TYPE_NUMBER) {
  1729              pos -= 2;
  1730              left = 0;
  1731              continue;
  1732          } else if (sf->tokenvec[left].type == TYPE_OPERATOR &&
  1733                     sf->tokenvec[left+1].type != TYPE_LEFTPARENS &&
  1734                     sf->tokenvec[left+2].type == TYPE_OPERATOR) {
  1735              left = 0;
  1736              pos -= 2;
  1737              continue;
  1738          } else if (sf->tokenvec[left].type == TYPE_LOGIC_OPERATOR &&
  1739                     sf->tokenvec[left+2].type == TYPE_LOGIC_OPERATOR) {
  1740              pos -= 2;
  1741              left = 0;
  1742              continue;
  1743          } else if (sf->tokenvec[left].type == TYPE_VARIABLE &&
  1744                     sf->tokenvec[left+1].type == TYPE_OPERATOR &&
  1745                     (sf->tokenvec[left+2].type == TYPE_VARIABLE ||
  1746                      sf->tokenvec[left+2].type == TYPE_NUMBER ||
  1747                      sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
  1748              pos -= 2;
  1749              left = 0;
  1750              continue;
  1751          } else if ((sf->tokenvec[left].type == TYPE_BAREWORD ||
  1752                      sf->tokenvec[left].type == TYPE_NUMBER ) &&
  1753                     sf->tokenvec[left+1].type == TYPE_OPERATOR &&
  1754                     (sf->tokenvec[left+2].type == TYPE_NUMBER ||
  1755                      sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
  1756              pos -= 2;
  1757              left = 0;
  1758              continue;
  1759          } else if ((sf->tokenvec[left].type == TYPE_BAREWORD ||
  1760                      sf->tokenvec[left].type == TYPE_NUMBER ||
  1761                      sf->tokenvec[left].type == TYPE_VARIABLE ||
  1762                      sf->tokenvec[left].type == TYPE_STRING) &&
  1763                     sf->tokenvec[left+1].type == TYPE_OPERATOR &&
  1764                     streq(sf->tokenvec[left+1].val, "::") &&
  1765                     sf->tokenvec[left+2].type == TYPE_SQLTYPE) {
  1766              pos -= 2;
  1767              left = 0;
  1768              sf->stats_folds += 2;
  1769              continue;
  1770          } else if ((sf->tokenvec[left].type == TYPE_BAREWORD ||
  1771                      sf->tokenvec[left].type == TYPE_NUMBER ||
  1772                      sf->tokenvec[left].type == TYPE_STRING ||
  1773                      sf->tokenvec[left].type == TYPE_VARIABLE) &&
  1774                     sf->tokenvec[left+1].type == TYPE_COMMA &&
  1775                     (sf->tokenvec[left+2].type == TYPE_NUMBER ||
  1776                      sf->tokenvec[left+2].type == TYPE_BAREWORD ||
  1777                      sf->tokenvec[left+2].type == TYPE_STRING ||
  1778                      sf->tokenvec[left+2].type == TYPE_VARIABLE)) {
  1779              pos -= 2;
  1780              left = 0;
  1781              continue;
  1782          } else if ((sf->tokenvec[left].type == TYPE_EXPRESSION ||
  1783                      sf->tokenvec[left].type == TYPE_GROUP ||
  1784                      sf->tokenvec[left].type == TYPE_COMMA) &&
  1785                     st_is_unary_op(&sf->tokenvec[left+1]) &&
  1786                     sf->tokenvec[left+2].type == TYPE_LEFTPARENS) {
  1787              /* got something like SELECT + (, LIMIT + (
  1788               * remove unary operator
  1789               */
  1790              st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
  1791              pos -= 1;
  1792              left = 0;
  1793              continue;
  1794          } else if ((sf->tokenvec[left].type == TYPE_KEYWORD ||
  1795                      sf->tokenvec[left].type == TYPE_EXPRESSION ||
  1796                      sf->tokenvec[left].type == TYPE_GROUP )  &&
  1797                     st_is_unary_op(&sf->tokenvec[left+1]) &&
  1798                     (sf->tokenvec[left+2].type == TYPE_NUMBER ||
  1799                      sf->tokenvec[left+2].type == TYPE_BAREWORD ||
  1800                      sf->tokenvec[left+2].type == TYPE_VARIABLE ||
  1801                      sf->tokenvec[left+2].type == TYPE_STRING ||
  1802                      sf->tokenvec[left+2].type == TYPE_FUNCTION )) {
  1803              /* remove unary operators
  1804               * select - 1
  1805               */
  1806              st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
  1807              pos -= 1;
  1808              left = 0;
  1809              continue;
  1810          } else if (sf->tokenvec[left].type == TYPE_COMMA &&
  1811                     st_is_unary_op(&sf->tokenvec[left+1]) &&
  1812                     (sf->tokenvec[left+2].type == TYPE_NUMBER ||
  1813                      sf->tokenvec[left+2].type == TYPE_BAREWORD ||
  1814                      sf->tokenvec[left+2].type == TYPE_VARIABLE ||
  1815                      sf->tokenvec[left+2].type == TYPE_STRING)) {
  1816              /*
  1817               * interesting case    turn ", -1"  ->> ",1" PLUS we need to back up
  1818               * one token if possible to see if more folding can be done
  1819               * "1,-1" --> "1"
  1820               */
  1821              st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
  1822              left = 0;
  1823              /* pos is >= 3 so this is safe */
  1824              assert(pos >= 3);
  1825              pos -= 3;
  1826              continue;
  1827          } else if (sf->tokenvec[left].type == TYPE_COMMA &&
  1828                     st_is_unary_op(&sf->tokenvec[left+1]) &&
  1829                     sf->tokenvec[left+2].type == TYPE_FUNCTION) {
  1830  
  1831              /* Separate case from above since you end up with
  1832               * 1,-sin(1) --> 1 (1)
  1833               * Here, just do
  1834               * 1,-sin(1) --> 1,sin(1)
  1835               * just remove unary operator
  1836               */
  1837              st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
  1838              pos -= 1;
  1839              left = 0;
  1840              continue;
  1841          } else if ((sf->tokenvec[left].type == TYPE_BAREWORD) &&
  1842                     (sf->tokenvec[left+1].type == TYPE_DOT) &&
  1843                     (sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
  1844              /* ignore the '.n'
  1845               * typically is this databasename.table
  1846               */
  1847              assert(pos >= 3);
  1848              pos -= 2;
  1849              left = 0;
  1850              continue;
  1851          } else if ((sf->tokenvec[left].type == TYPE_EXPRESSION) &&
  1852                     (sf->tokenvec[left+1].type == TYPE_DOT) &&
  1853                     (sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
  1854              /* select . `foo` --> select `foo` */
  1855              st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
  1856              pos -= 1;
  1857              left = 0;
  1858              continue;
  1859          } else if ((sf->tokenvec[left].type == TYPE_FUNCTION) &&
  1860                     (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) &&
  1861                     (sf->tokenvec[left+2].type != TYPE_RIGHTPARENS)) {
  1862              /*
  1863               * whats going on here
  1864               * Some SQL functions like USER() have 0 args
  1865               * if we get User(foo), then User is not a function
  1866               * This should be expanded since it eliminated a lot of false
  1867               * positives. 
  1868               */
  1869              if  (cstrcasecmp("USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0) {
  1870                  sf->tokenvec[left].type = TYPE_BAREWORD;
  1871              }
  1872          }
  1873  
  1874          /* no folding -- assume left-most token is
  1875             is good, now use the existing 2 tokens --
  1876             do not get another
  1877          */
  1878  
  1879          left += 1;
  1880  
  1881      } /* while(1) */
  1882  
  1883      /* if we have 4 or less tokens, and we had a comment token
  1884       * at the end, add it back
  1885       */
  1886  
  1887      if (left < LIBINJECTION_SQLI_MAX_TOKENS && last_comment.type == TYPE_COMMENT) {
  1888          st_copy(&sf->tokenvec[left], &last_comment);
  1889          left += 1;
  1890      }
  1891  
  1892      /* sometimes we grab a 6th token to help
  1893         determine the type of token 5.
  1894      */
  1895      if (left > LIBINJECTION_SQLI_MAX_TOKENS) {
  1896          left = LIBINJECTION_SQLI_MAX_TOKENS;
  1897      }
  1898  
  1899      return (int)left;
  1900  }
  1901  
  1902  /* secondary api: detects SQLi in a string, GIVEN a context.
  1903   *
  1904   * A context can be:
  1905   *   *  CHAR_NULL (\0), process as is
  1906   *   *  CHAR_SINGLE ('), process pretending input started with a
  1907   *          single quote.
  1908   *   *  CHAR_DOUBLE ("), process pretending input started with a
  1909   *          double quote.
  1910   *
  1911   */
  1912  const char* libinjection_sqli_fingerprint(struct libinjection_sqli_state * sql_state, int flags)
  1913  {
  1914      int i;
  1915      int tlen = 0;
  1916  
  1917      libinjection_sqli_reset(sql_state, flags);
  1918  
  1919      tlen = libinjection_sqli_fold(sql_state);
  1920  
  1921      /* Check for magic PHP backquote comment
  1922       * If:
  1923       * * last token is of type "bareword"
  1924       * * And is quoted in a backtick
  1925       * * And isn't closed
  1926       * * And it's empty?
  1927       * Then convert it to comment
  1928       */
  1929      if (tlen > 2 &&
  1930          sql_state->tokenvec[tlen-1].type == TYPE_BAREWORD &&
  1931          sql_state->tokenvec[tlen-1].str_open == CHAR_TICK &&
  1932          sql_state->tokenvec[tlen-1].len == 0 &&
  1933          sql_state->tokenvec[tlen-1].str_close == CHAR_NULL) {
  1934          sql_state->tokenvec[tlen-1].type = TYPE_COMMENT;
  1935      }
  1936  
  1937      for (i = 0; i < tlen; ++i) {
  1938          sql_state->fingerprint[i] = sql_state->tokenvec[i].type;
  1939      }
  1940  
  1941      /*
  1942       * make the fingerprint pattern a c-string (null delimited)
  1943       */
  1944      sql_state->fingerprint[tlen] = CHAR_NULL;
  1945  
  1946      /*
  1947       * check for 'X' in pattern, and then
  1948       * clear out all tokens
  1949       *
  1950       * this means parsing could not be done
  1951       * accurately due to pgsql's double comments
  1952       * or other syntax that isn't consistent.
  1953       * Should be very rare false positive
  1954       */
  1955      if (strchr(sql_state->fingerprint, TYPE_EVIL)) {
  1956          /*  needed for SWIG */
  1957          memset((void*)sql_state->fingerprint, 0, LIBINJECTION_SQLI_MAX_TOKENS + 1);
  1958          memset((void*)sql_state->tokenvec[0].val, 0, LIBINJECTION_SQLI_TOKEN_SIZE);
  1959  
  1960          sql_state->fingerprint[0] = TYPE_EVIL;
  1961  
  1962          sql_state->tokenvec[0].type = TYPE_EVIL;
  1963          sql_state->tokenvec[0].val[0] = TYPE_EVIL;
  1964          sql_state->tokenvec[1].type = CHAR_NULL;
  1965      }
  1966  
  1967  
  1968      return sql_state->fingerprint;
  1969  }
  1970  
  1971  int libinjection_sqli_check_fingerprint(struct libinjection_sqli_state* sql_state)
  1972  {
  1973      return libinjection_sqli_blacklist(sql_state) &&
  1974          libinjection_sqli_not_whitelist(sql_state);
  1975  }
  1976  
  1977  char libinjection_sqli_lookup_word(struct libinjection_sqli_state *sql_state, int lookup_type,
  1978                                     const char* str, size_t len)
  1979  {
  1980      if (lookup_type == LOOKUP_FINGERPRINT) {
  1981          return libinjection_sqli_check_fingerprint(sql_state) ? 'X' : '\0';
  1982      } else {
  1983          return bsearch_keyword_type(str, len, sql_keywords, sql_keywords_sz);
  1984      }
  1985  }
  1986  
  1987  int libinjection_sqli_blacklist(struct libinjection_sqli_state* sql_state)
  1988  {
  1989      /*
  1990       * use minimum of 8 bytes to make sure gcc -fstack-protector
  1991       * works correctly
  1992       */
  1993      char fp2[8];
  1994      char ch;
  1995      size_t i;
  1996      size_t len = strlen(sql_state->fingerprint);
  1997      int patmatch;
  1998  
  1999      if (len < 1) {
  2000          sql_state->reason = __LINE__;
  2001          return FALSE;
  2002      }
  2003  
  2004      /*
  2005        to keep everything compatible, convert the
  2006        v0 fingerprint pattern to v1
  2007        v0: up to 5 chars, mixed case
  2008        v1: 1 char is '0', up to 5 more chars, upper case
  2009      */
  2010  
  2011      fp2[0] = '0';
  2012      for (i = 0; i < len; ++i) {
  2013          ch = sql_state->fingerprint[i];
  2014          if (ch >= 'a' && ch <= 'z') {
  2015              ch -= 0x20;
  2016          }
  2017          fp2[i+1] = ch;
  2018      }
  2019      fp2[i+1] = '\0';
  2020  
  2021      patmatch = is_keyword(fp2, len + 1) == TYPE_FINGERPRINT;
  2022  
  2023      /*
  2024       * No match.
  2025       *
  2026       * Set sql_state->reason to current line number
  2027       * only for debugging purposes.
  2028       */
  2029      if (!patmatch) {
  2030          sql_state->reason = __LINE__;
  2031          return FALSE;
  2032      }
  2033  
  2034      return TRUE;
  2035  }
  2036  
  2037  /*
  2038   * return TRUE if SQLi, false is benign
  2039   */
  2040  int libinjection_sqli_not_whitelist(struct libinjection_sqli_state* sql_state)
  2041  {
  2042      /*
  2043       * We assume we got a SQLi match
  2044       * This next part just helps reduce false positives.
  2045       *
  2046       */
  2047      char ch;
  2048      size_t tlen = strlen(sql_state->fingerprint);
  2049  
  2050      if (tlen > 1 && sql_state->fingerprint[tlen-1] == TYPE_COMMENT) {
  2051          /*
  2052           * if ending comment is contains 'sp_password' then it's SQLi!
  2053           * MS Audit log apparently ignores anything with
  2054           * 'sp_password' in it. Unable to find primary reference to
  2055           * this "feature" of SQL Server but seems to be known SQLi
  2056           * technique
  2057           */
  2058          if (my_memmem(sql_state->s, sql_state->slen,
  2059                        "sp_password", strlen("sp_password"))) {
  2060              sql_state->reason = __LINE__;
  2061              return TRUE;
  2062          }
  2063      }
  2064  
  2065      switch (tlen) {
  2066      case 2:{
  2067          /*
  2068           * case 2 are "very small SQLi" which make them
  2069           * hard to tell from normal input...
  2070           */
  2071  
  2072          if (sql_state->fingerprint[1] == TYPE_UNION) {
  2073              if (sql_state->stats_tokens == 2) {
  2074                  /* not sure why but 1U comes up in SQLi attack
  2075                   * likely part of parameter splitting/etc.
  2076                   * lots of reasons why "1 union" might be normal
  2077                   * input, so beep only if other SQLi things are present
  2078                   */
  2079                  /* it really is a number and 'union'
  2080                   * other wise it has folding or comments
  2081                   */
  2082                  sql_state->reason = __LINE__;
  2083                  return FALSE;
  2084              } else {
  2085                  sql_state->reason = __LINE__;
  2086                  return TRUE;
  2087              }
  2088          }
  2089          /*
  2090           * if 'comment' is '#' ignore.. too many FP
  2091           */
  2092          if (sql_state->tokenvec[1].val[0] == '#') {
  2093              sql_state->reason = __LINE__;
  2094              return FALSE;
  2095          }
  2096  
  2097          /*
  2098           * for fingerprint like 'nc', only comments of /x are treated
  2099           * as SQL... ending comments of "--" and "#" are not SQLi
  2100           */
  2101          if (sql_state->tokenvec[0].type == TYPE_BAREWORD &&
  2102              sql_state->tokenvec[1].type == TYPE_COMMENT &&
  2103              sql_state->tokenvec[1].val[0] != '/') {
  2104                  sql_state->reason = __LINE__;
  2105                  return FALSE;
  2106          }
  2107  
  2108          /*
  2109           * if '1c' ends with '/x' then it's SQLi
  2110           */
  2111          if (sql_state->tokenvec[0].type == TYPE_NUMBER &&
  2112              sql_state->tokenvec[1].type == TYPE_COMMENT &&
  2113              sql_state->tokenvec[1].val[0] == '/') {
  2114              return TRUE;
  2115          }
  2116  
  2117          /**
  2118           * there are some odd base64-looking query string values
  2119           * 1234-ABCDEFEhfhihwuefi--
  2120           * which evaluate to "1c"... these are not SQLi
  2121           * but 1234-- probably is.
  2122           * Make sure the "1" in "1c" is actually a true decimal number
  2123           *
  2124           * Need to check -original- string since the folding step
  2125           * may have merged tokens, e.g. "1+FOO" is folded into "1"
  2126           *
  2127           * Note: evasion: 1*1--
  2128           */
  2129          if (sql_state->tokenvec[0].type == TYPE_NUMBER &&
  2130              sql_state->tokenvec[1].type == TYPE_COMMENT) {
  2131              if (sql_state->stats_tokens > 2) {
  2132                  /* we have some folding going on, highly likely SQLi */
  2133                  sql_state->reason = __LINE__;
  2134                  return TRUE;
  2135              }
  2136              /*
  2137               * we check that next character after the number is either whitespace,
  2138               * or '/' or a '-' ==> SQLi.
  2139               */
  2140              ch = sql_state->s[sql_state->tokenvec[0].len];
  2141              if ( ch <= 32 ) {
  2142                  /* next char was whitespace,e.g. "1234 --"
  2143                   * this isn't exactly correct.. ideally we should skip over all whitespace
  2144                   * but this seems to be ok for now
  2145                   */
  2146                  return TRUE;
  2147              }
  2148              if (ch == '/' && sql_state->s[sql_state->tokenvec[0].len + 1] == '*') {
  2149                  return TRUE;
  2150              }
  2151              if (ch == '-' && sql_state->s[sql_state->tokenvec[0].len + 1] == '-') {
  2152                  return TRUE;
  2153              }
  2154  
  2155              sql_state->reason = __LINE__;
  2156              return FALSE;
  2157          }
  2158  
  2159          /*
  2160           * detect obvious SQLi scans.. many people put '--' in plain text
  2161           * so only detect if input ends with '--', e.g. 1-- but not 1-- foo
  2162           */
  2163          if ((sql_state->tokenvec[1].len > 2)
  2164              && sql_state->tokenvec[1].val[0] == '-') {
  2165              sql_state->reason = __LINE__;
  2166              return FALSE;
  2167          }
  2168  
  2169          break;
  2170      } /* case 2 */
  2171      case 3:{
  2172          /*
  2173           * ...foo' + 'bar...
  2174           * no opening quote, no closing quote
  2175           * and each string has data
  2176           */
  2177  
  2178          if (streq(sql_state->fingerprint, "sos")
  2179              || streq(sql_state->fingerprint, "s&s")) {
  2180  
  2181                  if ((sql_state->tokenvec[0].str_open == CHAR_NULL)
  2182                      && (sql_state->tokenvec[2].str_close == CHAR_NULL)
  2183                      && (sql_state->tokenvec[0].str_close == sql_state->tokenvec[2].str_open)) {
  2184                      /*
  2185                       * if ....foo" + "bar....
  2186                       */
  2187                      sql_state->reason = __LINE__;
  2188                      return TRUE;
  2189                  }
  2190                  if (sql_state->stats_tokens == 3) {
  2191                      sql_state->reason = __LINE__;
  2192                      return FALSE;
  2193                  }
  2194  
  2195                  /*
  2196                   * not SQLi
  2197                   */
  2198                  sql_state->reason = __LINE__;
  2199                  return FALSE;
  2200          } else if (streq(sql_state->fingerprint, "s&n") ||
  2201                     streq(sql_state->fingerprint, "n&1") ||
  2202                     streq(sql_state->fingerprint, "1&1") ||
  2203                     streq(sql_state->fingerprint, "1&v") ||
  2204                     streq(sql_state->fingerprint, "1&s")) {
  2205              /* 'sexy and 17' not SQLi
  2206               * 'sexy and 17<18'  SQLi
  2207               */
  2208              if (sql_state->stats_tokens == 3) {
  2209                  sql_state->reason = __LINE__;
  2210                  return FALSE;
  2211              }
  2212          } else if (sql_state->tokenvec[1].type == TYPE_KEYWORD) {
  2213              if ((sql_state->tokenvec[1].len < 5) ||
  2214                  cstrcasecmp("INTO", sql_state->tokenvec[1].val, 4)) {
  2215                  /* if it's not "INTO OUTFILE", or "INTO DUMPFILE" (MySQL)
  2216                   * then treat as safe
  2217                   */
  2218                  sql_state->reason = __LINE__;
  2219                  return FALSE;
  2220              }
  2221          }
  2222          break;
  2223      }  /* case 3 */
  2224      case 4:
  2225      case 5: {
  2226          /* nothing right now */
  2227          break;
  2228      } /* case 5 */
  2229      } /* end switch */
  2230  
  2231      return TRUE;
  2232  }
  2233  
  2234  /**  Main API, detects SQLi in an input.
  2235   *
  2236   *
  2237   */
  2238  static int reparse_as_mysql(struct libinjection_sqli_state * sql_state)
  2239  {
  2240      return sql_state->stats_comment_ddx ||
  2241          sql_state->stats_comment_hash;
  2242  }
  2243  
  2244  /*
  2245   * This function is mostly use with SWIG
  2246   */
  2247  struct libinjection_sqli_token*
  2248  libinjection_sqli_get_token(struct libinjection_sqli_state * sql_state, int i)
  2249  {
  2250      if (i < 0 || i > (int)LIBINJECTION_SQLI_MAX_TOKENS) {
  2251          return NULL;
  2252      }
  2253      return &(sql_state->tokenvec[i]);
  2254  }
  2255  
  2256  int libinjection_is_sqli(struct libinjection_sqli_state * sql_state)
  2257  {
  2258      const char *s = sql_state->s;
  2259      size_t slen = sql_state->slen;
  2260  
  2261      /*
  2262       * no input? not SQLi
  2263       */
  2264      if (slen == 0) {
  2265          return FALSE;
  2266      }
  2267  
  2268      /*
  2269       * test input "as-is"
  2270       */
  2271      libinjection_sqli_fingerprint(sql_state, FLAG_QUOTE_NONE | FLAG_SQL_ANSI);
  2272      if (sql_state->lookup(sql_state, LOOKUP_FINGERPRINT,
  2273                            sql_state->fingerprint, strlen(sql_state->fingerprint))) {
  2274          return TRUE;
  2275      } else if (reparse_as_mysql(sql_state)) {
  2276          libinjection_sqli_fingerprint(sql_state, FLAG_QUOTE_NONE | FLAG_SQL_MYSQL);
  2277          if (sql_state->lookup(sql_state, LOOKUP_FINGERPRINT,
  2278                                sql_state->fingerprint, strlen(sql_state->fingerprint))) {
  2279              return TRUE;
  2280          }
  2281      }
  2282  
  2283      /*
  2284       * if input has a single_quote, then
  2285       * test as if input was actually '
  2286       * example: if input if "1' = 1", then pretend it's
  2287       *   "'1' = 1"
  2288       * Porting Notes: example the same as doing
  2289       *   is_string_sqli(sql_state, "'" + s, slen+1, NULL, fn, arg)
  2290       *
  2291       */
  2292      if (memchr(s, CHAR_SINGLE, slen)) {
  2293          libinjection_sqli_fingerprint(sql_state, FLAG_QUOTE_SINGLE | FLAG_SQL_ANSI);
  2294          if (sql_state->lookup(sql_state, LOOKUP_FINGERPRINT,
  2295                                sql_state->fingerprint, strlen(sql_state->fingerprint))) {
  2296              return TRUE;
  2297          } else if (reparse_as_mysql(sql_state)) {
  2298              libinjection_sqli_fingerprint(sql_state, FLAG_QUOTE_SINGLE | FLAG_SQL_MYSQL);
  2299              if (sql_state->lookup(sql_state, LOOKUP_FINGERPRINT,
  2300                                    sql_state->fingerprint, strlen(sql_state->fingerprint))) {
  2301                  return TRUE;
  2302              }
  2303          }
  2304      }
  2305  
  2306      /*
  2307       * same as above but with a double-quote "
  2308       */
  2309      if (memchr(s, CHAR_DOUBLE, slen)) {
  2310          libinjection_sqli_fingerprint(sql_state, FLAG_QUOTE_DOUBLE | FLAG_SQL_MYSQL);
  2311          if (sql_state->lookup(sql_state, LOOKUP_FINGERPRINT,
  2312                                sql_state->fingerprint, strlen(sql_state->fingerprint))) {
  2313              return TRUE;
  2314          }
  2315      }
  2316  
  2317      /*
  2318       * Hurray, input is not SQLi
  2319       */
  2320      return FALSE;
  2321  }
  2322  
  2323  int libinjection_sqli(const char* s, size_t slen, char fingerprint[])
  2324  {
  2325      int issqli;
  2326      struct libinjection_sqli_state state;
  2327  
  2328      libinjection_sqli_init(&state, s, slen, 0);
  2329      issqli = libinjection_is_sqli(&state);
  2330      if (issqli) {
  2331          strcpy(fingerprint, state.fingerprint);
  2332      } else {
  2333          fingerprint[0] = '\0';
  2334      }
  2335      return issqli;
  2336  }