modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/misc/regexp.c

modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/misc/regexp.c (about)

     1  /*
     2  ** 2012-11-13
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  ******************************************************************************
    12  **
    13  ** The code in this file implements a compact but reasonably
    14  ** efficient regular-expression matcher for posix extended regular
    15  ** expressions against UTF8 text.
    16  **
    17  ** This file is an SQLite extension.  It registers a single function
    18  ** named "regexp(A,B)" where A is the regular expression and B is the
    19  ** string to be matched.  By registering this function, SQLite will also
    20  ** then implement the "B regexp A" operator.  Note that with the function
    21  ** the regular expression comes first, but with the operator it comes
    22  ** second.
    23  **
    24  **  The following regular expression syntax is supported:
    25  **
    26  **     X*      zero or more occurrences of X
    27  **     X+      one or more occurrences of X
    28  **     X?      zero or one occurrences of X
    29  **     X{p,q}  between p and q occurrences of X
    30  **     (X)     match X
    31  **     X|Y     X or Y
    32  **     ^X      X occurring at the beginning of the string
    33  **     X$      X occurring at the end of the string
    34  **     .       Match any single character
    35  **     \c      Character c where c is one of \{}()[]|*+?.
    36  **     \c      C-language escapes for c in afnrtv.  ex: \t or \n
    37  **     \uXXXX  Where XXXX is exactly 4 hex digits, unicode value XXXX
    38  **     \xXX    Where XX is exactly 2 hex digits, unicode value XX
    39  **     [abc]   Any single character from the set abc
    40  **     [^abc]  Any single character not in the set abc
    41  **     [a-z]   Any single character in the range a-z
    42  **     [^a-z]  Any single character not in the range a-z
    43  **     \b      Word boundary
    44  **     \w      Word character.  [A-Za-z0-9_]
    45  **     \W      Non-word character
    46  **     \d      Digit
    47  **     \D      Non-digit
    48  **     \s      Whitespace character
    49  **     \S      Non-whitespace character
    50  **
    51  ** A nondeterministic finite automaton (NFA) is used for matching, so the
    52  ** performance is bounded by O(N*M) where N is the size of the regular
    53  ** expression and M is the size of the input string.  The matcher never
    54  ** exhibits exponential behavior.  Note that the X{p,q} operator expands
    55  ** to p copies of X following by q-p copies of X? and that the size of the
    56  ** regular expression in the O(N*M) performance bound is computed after
    57  ** this expansion.
    58  */
    59  #include <string.h>
    60  #include <stdlib.h>
    61  #include "sqlite3ext.h"
    62  SQLITE_EXTENSION_INIT1
    63  
    64  /*
    65  ** The following #defines change the names of some functions implemented in
    66  ** this file to prevent name collisions with C-library functions of the
    67  ** same name.
    68  */
    69  #define re_match   sqlite3re_match
    70  #define re_compile sqlite3re_compile
    71  #define re_free    sqlite3re_free
    72  
    73  /* The end-of-input character */
    74  #define RE_EOF            0    /* End of input */
    75  
    76  /* The NFA is implemented as sequence of opcodes taken from the following
    77  ** set.  Each opcode has a single integer argument.
    78  */
    79  #define RE_OP_MATCH       1    /* Match the one character in the argument */
    80  #define RE_OP_ANY         2    /* Match any one character.  (Implements ".") */
    81  #define RE_OP_ANYSTAR     3    /* Special optimized version of .* */
    82  #define RE_OP_FORK        4    /* Continue to both next and opcode at iArg */
    83  #define RE_OP_GOTO        5    /* Jump to opcode at iArg */
    84  #define RE_OP_ACCEPT      6    /* Halt and indicate a successful match */
    85  #define RE_OP_CC_INC      7    /* Beginning of a [...] character class */
    86  #define RE_OP_CC_EXC      8    /* Beginning of a [^...] character class */
    87  #define RE_OP_CC_VALUE    9    /* Single value in a character class */
    88  #define RE_OP_CC_RANGE   10    /* Range of values in a character class */
    89  #define RE_OP_WORD       11    /* Perl word character [A-Za-z0-9_] */
    90  #define RE_OP_NOTWORD    12    /* Not a perl word character */
    91  #define RE_OP_DIGIT      13    /* digit:  [0-9] */
    92  #define RE_OP_NOTDIGIT   14    /* Not a digit */
    93  #define RE_OP_SPACE      15    /* space:  [ \t\n\r\v\f] */
    94  #define RE_OP_NOTSPACE   16    /* Not a digit */
    95  #define RE_OP_BOUNDARY   17    /* Boundary between word and non-word */
    96  
    97  /* Each opcode is a "state" in the NFA */
    98  typedef unsigned short ReStateNumber;
    99  
   100  /* Because this is an NFA and not a DFA, multiple states can be active at
   101  ** once.  An instance of the following object records all active states in
   102  ** the NFA.  The implementation is optimized for the common case where the
   103  ** number of actives states is small.
   104  */
   105  typedef struct ReStateSet {
   106    unsigned nState;            /* Number of current states */
   107    ReStateNumber *aState;      /* Current states */
   108  } ReStateSet;
   109  
   110  /* An input string read one character at a time.
   111  */
   112  typedef struct ReInput ReInput;
   113  struct ReInput {
   114    const unsigned char *z;  /* All text */
   115    int i;                   /* Next byte to read */
   116    int mx;                  /* EOF when i>=mx */
   117  };
   118  
   119  /* A compiled NFA (or an NFA that is in the process of being compiled) is
   120  ** an instance of the following object.
   121  */
   122  typedef struct ReCompiled ReCompiled;
   123  struct ReCompiled {
   124    ReInput sIn;                /* Regular expression text */
   125    const char *zErr;           /* Error message to return */
   126    char *aOp;                  /* Operators for the virtual machine */
   127    int *aArg;                  /* Arguments to each operator */
   128    unsigned (*xNextChar)(ReInput*);  /* Next character function */
   129    unsigned char zInit[12];    /* Initial text to match */
   130    int nInit;                  /* Number of characters in zInit */
   131    unsigned nState;            /* Number of entries in aOp[] and aArg[] */
   132    unsigned nAlloc;            /* Slots allocated for aOp[] and aArg[] */
   133  };
   134  
   135  /* Add a state to the given state set if it is not already there */
   136  static void re_add_state(ReStateSet *pSet, int newState){
   137    unsigned i;
   138    for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
   139    pSet->aState[pSet->nState++] = (ReStateNumber)newState;
   140  }
   141  
   142  /* Extract the next unicode character from *pzIn and return it.  Advance
   143  ** *pzIn to the first byte past the end of the character returned.  To
   144  ** be clear:  this routine converts utf8 to unicode.  This routine is 
   145  ** optimized for the common case where the next character is a single byte.
   146  */
   147  static unsigned re_next_char(ReInput *p){
   148    unsigned c;
   149    if( p->i>=p->mx ) return 0;
   150    c = p->z[p->i++];
   151    if( c>=0x80 ){
   152      if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
   153        c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
   154        if( c<0x80 ) c = 0xfffd;
   155      }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
   156             && (p->z[p->i+1]&0xc0)==0x80 ){
   157        c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
   158        p->i += 2;
   159        if( c<=0x3ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
   160      }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80
   161             && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
   162        c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
   163                         | (p->z[p->i+2]&0x3f);
   164        p->i += 3;
   165        if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
   166      }else{
   167        c = 0xfffd;
   168      }
   169    }
   170    return c;
   171  }
   172  static unsigned re_next_char_nocase(ReInput *p){
   173    unsigned c = re_next_char(p);
   174    if( c>='A' && c<='Z' ) c += 'a' - 'A';
   175    return c;
   176  }
   177  
   178  /* Return true if c is a perl "word" character:  [A-Za-z0-9_] */
   179  static int re_word_char(int c){
   180    return (c>='0' && c<='9') || (c>='a' && c<='z')
   181        || (c>='A' && c<='Z') || c=='_';
   182  }
   183  
   184  /* Return true if c is a "digit" character:  [0-9] */
   185  static int re_digit_char(int c){
   186    return (c>='0' && c<='9');
   187  }
   188  
   189  /* Return true if c is a perl "space" character:  [ \t\r\n\v\f] */
   190  static int re_space_char(int c){
   191    return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
   192  }
   193  
   194  /* Run a compiled regular expression on the zero-terminated input
   195  ** string zIn[].  Return true on a match and false if there is no match.
   196  */
   197  static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
   198    ReStateSet aStateSet[2], *pThis, *pNext;
   199    ReStateNumber aSpace[100];
   200    ReStateNumber *pToFree;
   201    unsigned int i = 0;
   202    unsigned int iSwap = 0;
   203    int c = RE_EOF+1;
   204    int cPrev = 0;
   205    int rc = 0;
   206    ReInput in;
   207  
   208    in.z = zIn;
   209    in.i = 0;
   210    in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
   211  
   212    /* Look for the initial prefix match, if there is one. */
   213    if( pRe->nInit ){
   214      unsigned char x = pRe->zInit[0];
   215      while( in.i+pRe->nInit<=in.mx 
   216       && (zIn[in.i]!=x ||
   217           strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0)
   218      ){
   219        in.i++;
   220      }
   221      if( in.i+pRe->nInit>in.mx ) return 0;
   222    }
   223  
   224    if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
   225      pToFree = 0;
   226      aStateSet[0].aState = aSpace;
   227    }else{
   228      pToFree = sqlite3_malloc( sizeof(ReStateNumber)*2*pRe->nState );
   229      if( pToFree==0 ) return -1;
   230      aStateSet[0].aState = pToFree;
   231    }
   232    aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
   233    pNext = &aStateSet[1];
   234    pNext->nState = 0;
   235    re_add_state(pNext, 0);
   236    while( c!=RE_EOF && pNext->nState>0 ){
   237      cPrev = c;
   238      c = pRe->xNextChar(&in);
   239      pThis = pNext;
   240      pNext = &aStateSet[iSwap];
   241      iSwap = 1 - iSwap;
   242      pNext->nState = 0;
   243      for(i=0; i<pThis->nState; i++){
   244        int x = pThis->aState[i];
   245        switch( pRe->aOp[x] ){
   246          case RE_OP_MATCH: {
   247            if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
   248            break;
   249          }
   250          case RE_OP_ANY: {
   251            re_add_state(pNext, x+1);
   252            break;
   253          }
   254          case RE_OP_WORD: {
   255            if( re_word_char(c) ) re_add_state(pNext, x+1);
   256            break;
   257          }
   258          case RE_OP_NOTWORD: {
   259            if( !re_word_char(c) ) re_add_state(pNext, x+1);
   260            break;
   261          }
   262          case RE_OP_DIGIT: {
   263            if( re_digit_char(c) ) re_add_state(pNext, x+1);
   264            break;
   265          }
   266          case RE_OP_NOTDIGIT: {
   267            if( !re_digit_char(c) ) re_add_state(pNext, x+1);
   268            break;
   269          }
   270          case RE_OP_SPACE: {
   271            if( re_space_char(c) ) re_add_state(pNext, x+1);
   272            break;
   273          }
   274          case RE_OP_NOTSPACE: {
   275            if( !re_space_char(c) ) re_add_state(pNext, x+1);
   276            break;
   277          }
   278          case RE_OP_BOUNDARY: {
   279            if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
   280            break;
   281          }
   282          case RE_OP_ANYSTAR: {
   283            re_add_state(pNext, x);
   284            re_add_state(pThis, x+1);
   285            break;
   286          }
   287          case RE_OP_FORK: {
   288            re_add_state(pThis, x+pRe->aArg[x]);
   289            re_add_state(pThis, x+1);
   290            break;
   291          }
   292          case RE_OP_GOTO: {
   293            re_add_state(pThis, x+pRe->aArg[x]);
   294            break;
   295          }
   296          case RE_OP_ACCEPT: {
   297            rc = 1;
   298            goto re_match_end;
   299          }
   300          case RE_OP_CC_INC:
   301          case RE_OP_CC_EXC: {
   302            int j = 1;
   303            int n = pRe->aArg[x];
   304            int hit = 0;
   305            for(j=1; j>0 && j<n; j++){
   306              if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){
   307                if( pRe->aArg[x+j]==c ){
   308                  hit = 1;
   309                  j = -1;
   310                }
   311              }else{
   312                if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){
   313                  hit = 1;
   314                  j = -1;
   315                }else{
   316                  j++;
   317                }
   318              }
   319            }
   320            if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
   321            if( hit ) re_add_state(pNext, x+n);
   322            break;            
   323          }
   324        }
   325      }
   326    }
   327    for(i=0; i<pNext->nState; i++){
   328      if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; }
   329    }
   330  re_match_end:
   331    sqlite3_free(pToFree);
   332    return rc;
   333  }
   334  
   335  /* Resize the opcode and argument arrays for an RE under construction.
   336  */
   337  static int re_resize(ReCompiled *p, int N){
   338    char *aOp;
   339    int *aArg;
   340    aOp = sqlite3_realloc(p->aOp, N*sizeof(p->aOp[0]));
   341    if( aOp==0 ) return 1;
   342    p->aOp = aOp;
   343    aArg = sqlite3_realloc(p->aArg, N*sizeof(p->aArg[0]));
   344    if( aArg==0 ) return 1;
   345    p->aArg = aArg;
   346    p->nAlloc = N;
   347    return 0;
   348  }
   349  
   350  /* Insert a new opcode and argument into an RE under construction.  The
   351  ** insertion point is just prior to existing opcode iBefore.
   352  */
   353  static int re_insert(ReCompiled *p, int iBefore, int op, int arg){
   354    int i;
   355    if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;
   356    for(i=p->nState; i>iBefore; i--){
   357      p->aOp[i] = p->aOp[i-1];
   358      p->aArg[i] = p->aArg[i-1];
   359    }
   360    p->nState++;
   361    p->aOp[iBefore] = (char)op;
   362    p->aArg[iBefore] = arg;
   363    return iBefore;
   364  }
   365  
   366  /* Append a new opcode and argument to the end of the RE under construction.
   367  */
   368  static int re_append(ReCompiled *p, int op, int arg){
   369    return re_insert(p, p->nState, op, arg);
   370  }
   371  
   372  /* Make a copy of N opcodes starting at iStart onto the end of the RE
   373  ** under construction.
   374  */
   375  static void re_copy(ReCompiled *p, int iStart, int N){
   376    if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;
   377    memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));
   378    memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));
   379    p->nState += N;
   380  }
   381  
   382  /* Return true if c is a hexadecimal digit character:  [0-9a-fA-F]
   383  ** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c).  If
   384  ** c is not a hex digit *pV is unchanged.
   385  */
   386  static int re_hex(int c, int *pV){
   387    if( c>='0' && c<='9' ){
   388      c -= '0';
   389    }else if( c>='a' && c<='f' ){
   390      c -= 'a' - 10;
   391    }else if( c>='A' && c<='F' ){
   392      c -= 'A' - 10;
   393    }else{
   394      return 0;
   395    }
   396    *pV = (*pV)*16 + (c & 0xff);
   397    return 1;
   398  }
   399  
   400  /* A backslash character has been seen, read the next character and
   401  ** return its interpretation.
   402  */
   403  static unsigned re_esc_char(ReCompiled *p){
   404    static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
   405    static const char zTrans[] = "\a\f\n\r\t\v";
   406    int i, v = 0;
   407    char c;
   408    if( p->sIn.i>=p->sIn.mx ) return 0;
   409    c = p->sIn.z[p->sIn.i];
   410    if( c=='u' && p->sIn.i+4<p->sIn.mx ){
   411      const unsigned char *zIn = p->sIn.z + p->sIn.i;
   412      if( re_hex(zIn[1],&v)
   413       && re_hex(zIn[2],&v)
   414       && re_hex(zIn[3],&v)
   415       && re_hex(zIn[4],&v)
   416      ){
   417        p->sIn.i += 5;
   418        return v;
   419      }
   420    }
   421    if( c=='x' && p->sIn.i+2<p->sIn.mx ){
   422      const unsigned char *zIn = p->sIn.z + p->sIn.i;
   423      if( re_hex(zIn[1],&v)
   424       && re_hex(zIn[2],&v)
   425      ){
   426        p->sIn.i += 3;
   427        return v;
   428      }
   429    }
   430    for(i=0; zEsc[i] && zEsc[i]!=c; i++){}
   431    if( zEsc[i] ){
   432      if( i<6 ) c = zTrans[i];
   433      p->sIn.i++;
   434    }else{
   435      p->zErr = "unknown \\ escape";
   436    }
   437    return c;
   438  }
   439  
   440  /* Forward declaration */
   441  static const char *re_subcompile_string(ReCompiled*);
   442  
   443  /* Peek at the next byte of input */
   444  static unsigned char rePeek(ReCompiled *p){
   445    return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;
   446  }
   447  
   448  /* Compile RE text into a sequence of opcodes.  Continue up to the
   449  ** first unmatched ")" character, then return.  If an error is found,
   450  ** return a pointer to the error message string.
   451  */
   452  static const char *re_subcompile_re(ReCompiled *p){
   453    const char *zErr;
   454    int iStart, iEnd, iGoto;
   455    iStart = p->nState;
   456    zErr = re_subcompile_string(p);
   457    if( zErr ) return zErr;
   458    while( rePeek(p)=='|' ){
   459      iEnd = p->nState;
   460      re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);
   461      iGoto = re_append(p, RE_OP_GOTO, 0);
   462      p->sIn.i++;
   463      zErr = re_subcompile_string(p);
   464      if( zErr ) return zErr;
   465      p->aArg[iGoto] = p->nState - iGoto;
   466    }
   467    return 0;
   468  }
   469  
   470  /* Compile an element of regular expression text (anything that can be
   471  ** an operand to the "|" operator).  Return NULL on success or a pointer
   472  ** to the error message if there is a problem.
   473  */
   474  static const char *re_subcompile_string(ReCompiled *p){
   475    int iPrev = -1;
   476    int iStart;
   477    unsigned c;
   478    const char *zErr;
   479    while( (c = p->xNextChar(&p->sIn))!=0 ){
   480      iStart = p->nState;
   481      switch( c ){
   482        case '|':
   483        case '$': 
   484        case ')': {
   485          p->sIn.i--;
   486          return 0;
   487        }
   488        case '(': {
   489          zErr = re_subcompile_re(p);
   490          if( zErr ) return zErr;
   491          if( rePeek(p)!=')' ) return "unmatched '('";
   492          p->sIn.i++;
   493          break;
   494        }
   495        case '.': {
   496          if( rePeek(p)=='*' ){
   497            re_append(p, RE_OP_ANYSTAR, 0);
   498            p->sIn.i++;
   499          }else{ 
   500            re_append(p, RE_OP_ANY, 0);
   501          }
   502          break;
   503        }
   504        case '*': {
   505          if( iPrev<0 ) return "'*' without operand";
   506          re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
   507          re_append(p, RE_OP_FORK, iPrev - p->nState + 1);
   508          break;
   509        }
   510        case '+': {
   511          if( iPrev<0 ) return "'+' without operand";
   512          re_append(p, RE_OP_FORK, iPrev - p->nState);
   513          break;
   514        }
   515        case '?': {
   516          if( iPrev<0 ) return "'?' without operand";
   517          re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);
   518          break;
   519        }
   520        case '{': {
   521          int m = 0, n = 0;
   522          int sz, j;
   523          if( iPrev<0 ) return "'{m,n}' without operand";
   524          while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; }
   525          n = m;
   526          if( c==',' ){
   527            p->sIn.i++;
   528            n = 0;
   529            while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; }
   530          }
   531          if( c!='}' ) return "unmatched '{'";
   532          if( n>0 && n<m ) return "n less than m in '{m,n}'";
   533          p->sIn.i++;
   534          sz = p->nState - iPrev;
   535          if( m==0 ){
   536            if( n==0 ) return "both m and n are zero in '{m,n}'";
   537            re_insert(p, iPrev, RE_OP_FORK, sz+1);
   538            n--;
   539          }else{
   540            for(j=1; j<m; j++) re_copy(p, iPrev, sz);
   541          }
   542          for(j=m; j<n; j++){
   543            re_append(p, RE_OP_FORK, sz+1);
   544            re_copy(p, iPrev, sz);
   545          }
   546          if( n==0 && m>0 ){
   547            re_append(p, RE_OP_FORK, -sz);
   548          }
   549          break;
   550        }
   551        case '[': {
   552          int iFirst = p->nState;
   553          if( rePeek(p)=='^' ){
   554            re_append(p, RE_OP_CC_EXC, 0);
   555            p->sIn.i++;
   556          }else{
   557            re_append(p, RE_OP_CC_INC, 0);
   558          }
   559          while( (c = p->xNextChar(&p->sIn))!=0 ){
   560            if( c=='[' && rePeek(p)==':' ){
   561              return "POSIX character classes not supported";
   562            }
   563            if( c=='\\' ) c = re_esc_char(p);
   564            if( rePeek(p)=='-' ){
   565              re_append(p, RE_OP_CC_RANGE, c);
   566              p->sIn.i++;
   567              c = p->xNextChar(&p->sIn);
   568              if( c=='\\' ) c = re_esc_char(p);
   569              re_append(p, RE_OP_CC_RANGE, c);
   570            }else{
   571              re_append(p, RE_OP_CC_VALUE, c);
   572            }
   573            if( rePeek(p)==']' ){ p->sIn.i++; break; }
   574          }
   575          if( c==0 ) return "unclosed '['";
   576          p->aArg[iFirst] = p->nState - iFirst;
   577          break;
   578        }
   579        case '\\': {
   580          int specialOp = 0;
   581          switch( rePeek(p) ){
   582            case 'b': specialOp = RE_OP_BOUNDARY;   break;
   583            case 'd': specialOp = RE_OP_DIGIT;      break;
   584            case 'D': specialOp = RE_OP_NOTDIGIT;   break;
   585            case 's': specialOp = RE_OP_SPACE;      break;
   586            case 'S': specialOp = RE_OP_NOTSPACE;   break;
   587            case 'w': specialOp = RE_OP_WORD;       break;
   588            case 'W': specialOp = RE_OP_NOTWORD;    break;
   589          }
   590          if( specialOp ){
   591            p->sIn.i++;
   592            re_append(p, specialOp, 0);
   593          }else{
   594            c = re_esc_char(p);
   595            re_append(p, RE_OP_MATCH, c);
   596          }
   597          break;
   598        }
   599        default: {
   600          re_append(p, RE_OP_MATCH, c);
   601          break;
   602        }
   603      }
   604      iPrev = iStart;
   605    }
   606    return 0;
   607  }
   608  
   609  /* Free and reclaim all the memory used by a previously compiled
   610  ** regular expression.  Applications should invoke this routine once
   611  ** for every call to re_compile() to avoid memory leaks.
   612  */
   613  void re_free(ReCompiled *pRe){
   614    if( pRe ){
   615      sqlite3_free(pRe->aOp);
   616      sqlite3_free(pRe->aArg);
   617      sqlite3_free(pRe);
   618    }
   619  }
   620  
   621  /*
   622  ** Compile a textual regular expression in zIn[] into a compiled regular
   623  ** expression suitable for us by re_match() and return a pointer to the
   624  ** compiled regular expression in *ppRe.  Return NULL on success or an
   625  ** error message if something goes wrong.
   626  */
   627  const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
   628    ReCompiled *pRe;
   629    const char *zErr;
   630    int i, j;
   631  
   632    *ppRe = 0;
   633    pRe = sqlite3_malloc( sizeof(*pRe) );
   634    if( pRe==0 ){
   635      return "out of memory";
   636    }
   637    memset(pRe, 0, sizeof(*pRe));
   638    pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
   639    if( re_resize(pRe, 30) ){
   640      re_free(pRe);
   641      return "out of memory";
   642    }
   643    if( zIn[0]=='^' ){
   644      zIn++;
   645    }else{
   646      re_append(pRe, RE_OP_ANYSTAR, 0);
   647    }
   648    pRe->sIn.z = (unsigned char*)zIn;
   649    pRe->sIn.i = 0;
   650    pRe->sIn.mx = (int)strlen(zIn);
   651    zErr = re_subcompile_re(pRe);
   652    if( zErr ){
   653      re_free(pRe);
   654      return zErr;
   655    }
   656    if( rePeek(pRe)=='$' && pRe->sIn.i+1>=pRe->sIn.mx ){
   657      re_append(pRe, RE_OP_MATCH, RE_EOF);
   658      re_append(pRe, RE_OP_ACCEPT, 0);
   659      *ppRe = pRe;
   660    }else if( pRe->sIn.i>=pRe->sIn.mx ){
   661      re_append(pRe, RE_OP_ACCEPT, 0);
   662      *ppRe = pRe;
   663    }else{
   664      re_free(pRe);
   665      return "unrecognized character";
   666    }
   667  
   668    /* The following is a performance optimization.  If the regex begins with
   669    ** ".*" (if the input regex lacks an initial "^") and afterwards there are
   670    ** one or more matching characters, enter those matching characters into
   671    ** zInit[].  The re_match() routine can then search ahead in the input 
   672    ** string looking for the initial match without having to run the whole
   673    ** regex engine over the string.  Do not worry able trying to match
   674    ** unicode characters beyond plane 0 - those are very rare and this is
   675    ** just an optimization. */
   676    if( pRe->aOp[0]==RE_OP_ANYSTAR ){
   677      for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
   678        unsigned x = pRe->aArg[i];
   679        if( x<=127 ){
   680          pRe->zInit[j++] = (unsigned char)x;
   681        }else if( x<=0xfff ){
   682          pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
   683          pRe->zInit[j++] = 0x80 | (x&0x3f);
   684        }else if( x<=0xffff ){
   685          pRe->zInit[j++] = (unsigned char)(0xd0 | (x>>12));
   686          pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
   687          pRe->zInit[j++] = 0x80 | (x&0x3f);
   688        }else{
   689          break;
   690        }
   691      }
   692      if( j>0 && pRe->zInit[j-1]==0 ) j--;
   693      pRe->nInit = j;
   694    }
   695    return pRe->zErr;
   696  }
   697  
   698  /*
   699  ** Implementation of the regexp() SQL function.  This function implements
   700  ** the build-in REGEXP operator.  The first argument to the function is the
   701  ** pattern and the second argument is the string.  So, the SQL statements:
   702  **
   703  **       A REGEXP B
   704  **
   705  ** is implemented as regexp(B,A).
   706  */
   707  static void re_sql_func(
   708    sqlite3_context *context, 
   709    int argc, 
   710    sqlite3_value **argv
   711  ){
   712    ReCompiled *pRe;          /* Compiled regular expression */
   713    const char *zPattern;     /* The regular expression */
   714    const unsigned char *zStr;/* String being searched */
   715    const char *zErr;         /* Compile error message */
   716    int setAux = 0;           /* True to invoke sqlite3_set_auxdata() */
   717  
   718    pRe = sqlite3_get_auxdata(context, 0);
   719    if( pRe==0 ){
   720      zPattern = (const char*)sqlite3_value_text(argv[0]);
   721      if( zPattern==0 ) return;
   722      zErr = re_compile(&pRe, zPattern, 0);
   723      if( zErr ){
   724        re_free(pRe);
   725        sqlite3_result_error(context, zErr, -1);
   726        return;
   727      }
   728      if( pRe==0 ){
   729        sqlite3_result_error_nomem(context);
   730        return;
   731      }
   732      setAux = 1;
   733    }
   734    zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
   735    if( zStr!=0 ){
   736      sqlite3_result_int(context, re_match(pRe, zStr, -1));
   737    }
   738    if( setAux ){
   739      sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
   740    }
   741  }
   742  
   743  /*
   744  ** Invoke this routine to register the regexp() function with the
   745  ** SQLite database connection.
   746  */
   747  #ifdef _WIN32
   748  __declspec(dllexport)
   749  #endif
   750  int sqlite3_regexp_init(
   751    sqlite3 *db, 
   752    char **pzErrMsg, 
   753    const sqlite3_api_routines *pApi
   754  ){
   755    int rc = SQLITE_OK;
   756    SQLITE_EXTENSION_INIT2(pApi);
   757    rc = sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0,
   758                                   re_sql_func, 0, 0);
   759    return rc;
   760  }