modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts5/fts5_tokenize.c (about)

     1  /*
     2  ** 2014 May 31
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  ******************************************************************************
    12  */
    13  
    14  
    15  #include "fts5Int.h"
    16  
    17  /**************************************************************************
    18  ** Start of ascii tokenizer implementation.
    19  */
    20  
    21  /*
    22  ** For tokenizers with no "unicode" modifier, the set of token characters
    23  ** is the same as the set of ASCII range alphanumeric characters. 
    24  */
    25  static unsigned char aAsciiTokenChar[128] = {
    26    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
    27    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
    28    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
    29    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
    30    0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
    31    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
    32    0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
    33    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
    34  };
    35  
    36  typedef struct AsciiTokenizer AsciiTokenizer;
    37  struct AsciiTokenizer {
    38    unsigned char aTokenChar[128];
    39  };
    40  
    41  static void fts5AsciiAddExceptions(
    42    AsciiTokenizer *p, 
    43    const char *zArg, 
    44    int bTokenChars
    45  ){
    46    int i;
    47    for(i=0; zArg[i]; i++){
    48      if( (zArg[i] & 0x80)==0 ){
    49        p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
    50      }
    51    }
    52  }
    53  
    54  /*
    55  ** Delete a "ascii" tokenizer.
    56  */
    57  static void fts5AsciiDelete(Fts5Tokenizer *p){
    58    sqlite3_free(p);
    59  }
    60  
    61  /*
    62  ** Create an "ascii" tokenizer.
    63  */
    64  static int fts5AsciiCreate(
    65    void *pUnused, 
    66    const char **azArg, int nArg,
    67    Fts5Tokenizer **ppOut
    68  ){
    69    int rc = SQLITE_OK;
    70    AsciiTokenizer *p = 0;
    71    UNUSED_PARAM(pUnused);
    72    if( nArg%2 ){
    73      rc = SQLITE_ERROR;
    74    }else{
    75      p = sqlite3_malloc(sizeof(AsciiTokenizer));
    76      if( p==0 ){
    77        rc = SQLITE_NOMEM;
    78      }else{
    79        int i;
    80        memset(p, 0, sizeof(AsciiTokenizer));
    81        memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
    82        for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
    83          const char *zArg = azArg[i+1];
    84          if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
    85            fts5AsciiAddExceptions(p, zArg, 1);
    86          }else
    87          if( 0==sqlite3_stricmp(azArg[i], "separators") ){
    88            fts5AsciiAddExceptions(p, zArg, 0);
    89          }else{
    90            rc = SQLITE_ERROR;
    91          }
    92        }
    93        if( rc!=SQLITE_OK ){
    94          fts5AsciiDelete((Fts5Tokenizer*)p);
    95          p = 0;
    96        }
    97      }
    98    }
    99  
   100    *ppOut = (Fts5Tokenizer*)p;
   101    return rc;
   102  }
   103  
   104  
   105  static void asciiFold(char *aOut, const char *aIn, int nByte){
   106    int i;
   107    for(i=0; i<nByte; i++){
   108      char c = aIn[i];
   109      if( c>='A' && c<='Z' ) c += 32;
   110      aOut[i] = c;
   111    }
   112  }
   113  
   114  /*
   115  ** Tokenize some text using the ascii tokenizer.
   116  */
   117  static int fts5AsciiTokenize(
   118    Fts5Tokenizer *pTokenizer,
   119    void *pCtx,
   120    int iUnused,
   121    const char *pText, int nText,
   122    int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
   123  ){
   124    AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
   125    int rc = SQLITE_OK;
   126    int ie;
   127    int is = 0;
   128  
   129    char aFold[64];
   130    int nFold = sizeof(aFold);
   131    char *pFold = aFold;
   132    unsigned char *a = p->aTokenChar;
   133  
   134    UNUSED_PARAM(iUnused);
   135  
   136    while( is<nText && rc==SQLITE_OK ){
   137      int nByte;
   138  
   139      /* Skip any leading divider characters. */
   140      while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
   141        is++;
   142      }
   143      if( is==nText ) break;
   144  
   145      /* Count the token characters */
   146      ie = is+1;
   147      while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
   148        ie++;
   149      }
   150  
   151      /* Fold to lower case */
   152      nByte = ie-is;
   153      if( nByte>nFold ){
   154        if( pFold!=aFold ) sqlite3_free(pFold);
   155        pFold = sqlite3_malloc(nByte*2);
   156        if( pFold==0 ){
   157          rc = SQLITE_NOMEM;
   158          break;
   159        }
   160        nFold = nByte*2;
   161      }
   162      asciiFold(pFold, &pText[is], nByte);
   163  
   164      /* Invoke the token callback */
   165      rc = xToken(pCtx, 0, pFold, nByte, is, ie);
   166      is = ie+1;
   167    }
   168    
   169    if( pFold!=aFold ) sqlite3_free(pFold);
   170    if( rc==SQLITE_DONE ) rc = SQLITE_OK;
   171    return rc;
   172  }
   173  
   174  /**************************************************************************
   175  ** Start of unicode61 tokenizer implementation.
   176  */
   177  
   178  
   179  /*
   180  ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
   181  ** from the sqlite3 source file utf.c. If this file is compiled as part
   182  ** of the amalgamation, they are not required.
   183  */
   184  #ifndef SQLITE_AMALGAMATION
   185  
   186  static const unsigned char sqlite3Utf8Trans1[] = {
   187    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
   188    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
   189    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
   190    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
   191    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
   192    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
   193    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
   194    0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
   195  };
   196  
   197  #define READ_UTF8(zIn, zTerm, c)                           \
   198    c = *(zIn++);                                            \
   199    if( c>=0xc0 ){                                           \
   200      c = sqlite3Utf8Trans1[c-0xc0];                         \
   201      while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
   202        c = (c<<6) + (0x3f & *(zIn++));                      \
   203      }                                                      \
   204      if( c<0x80                                             \
   205          || (c&0xFFFFF800)==0xD800                          \
   206          || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
   207    }
   208  
   209  
   210  #define WRITE_UTF8(zOut, c) {                          \
   211    if( c<0x00080 ){                                     \
   212      *zOut++ = (unsigned char)(c&0xFF);                 \
   213    }                                                    \
   214    else if( c<0x00800 ){                                \
   215      *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
   216      *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
   217    }                                                    \
   218    else if( c<0x10000 ){                                \
   219      *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
   220      *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
   221      *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
   222    }else{                                               \
   223      *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
   224      *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
   225      *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
   226      *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
   227    }                                                    \
   228  }
   229  
   230  #endif /* ifndef SQLITE_AMALGAMATION */
   231  
   232  typedef struct Unicode61Tokenizer Unicode61Tokenizer;
   233  struct Unicode61Tokenizer {
   234    unsigned char aTokenChar[128];  /* ASCII range token characters */
   235    char *aFold;                    /* Buffer to fold text into */
   236    int nFold;                      /* Size of aFold[] in bytes */
   237    int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
   238    int nException;
   239    int *aiException;
   240  };
   241  
   242  static int fts5UnicodeAddExceptions(
   243    Unicode61Tokenizer *p,          /* Tokenizer object */
   244    const char *z,                  /* Characters to treat as exceptions */
   245    int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
   246  ){
   247    int rc = SQLITE_OK;
   248    int n = (int)strlen(z);
   249    int *aNew;
   250  
   251    if( n>0 ){
   252      aNew = (int*)sqlite3_realloc(p->aiException, (n+p->nException)*sizeof(int));
   253      if( aNew ){
   254        int nNew = p->nException;
   255        const unsigned char *zCsr = (const unsigned char*)z;
   256        const unsigned char *zTerm = (const unsigned char*)&z[n];
   257        while( zCsr<zTerm ){
   258          int iCode;
   259          int bToken;
   260          READ_UTF8(zCsr, zTerm, iCode);
   261          if( iCode<128 ){
   262            p->aTokenChar[iCode] = (unsigned char)bTokenChars;
   263          }else{
   264            bToken = sqlite3Fts5UnicodeIsalnum(iCode);
   265            assert( (bToken==0 || bToken==1) ); 
   266            assert( (bTokenChars==0 || bTokenChars==1) );
   267            if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
   268              int i;
   269              for(i=0; i<nNew; i++){
   270                if( aNew[i]>iCode ) break;
   271              }
   272              memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
   273              aNew[i] = iCode;
   274              nNew++;
   275            }
   276          }
   277        }
   278        p->aiException = aNew;
   279        p->nException = nNew;
   280      }else{
   281        rc = SQLITE_NOMEM;
   282      }
   283    }
   284  
   285    return rc;
   286  }
   287  
   288  /*
   289  ** Return true if the p->aiException[] array contains the value iCode.
   290  */
   291  static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
   292    if( p->nException>0 ){
   293      int *a = p->aiException;
   294      int iLo = 0;
   295      int iHi = p->nException-1;
   296  
   297      while( iHi>=iLo ){
   298        int iTest = (iHi + iLo) / 2;
   299        if( iCode==a[iTest] ){
   300          return 1;
   301        }else if( iCode>a[iTest] ){
   302          iLo = iTest+1;
   303        }else{
   304          iHi = iTest-1;
   305        }
   306      }
   307    }
   308  
   309    return 0;
   310  }
   311  
   312  /*
   313  ** Delete a "unicode61" tokenizer.
   314  */
   315  static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
   316    if( pTok ){
   317      Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
   318      sqlite3_free(p->aiException);
   319      sqlite3_free(p->aFold);
   320      sqlite3_free(p);
   321    }
   322    return;
   323  }
   324  
   325  /*
   326  ** Create a "unicode61" tokenizer.
   327  */
   328  static int fts5UnicodeCreate(
   329    void *pUnused, 
   330    const char **azArg, int nArg,
   331    Fts5Tokenizer **ppOut
   332  ){
   333    int rc = SQLITE_OK;             /* Return code */
   334    Unicode61Tokenizer *p = 0;      /* New tokenizer object */ 
   335  
   336    UNUSED_PARAM(pUnused);
   337  
   338    if( nArg%2 ){
   339      rc = SQLITE_ERROR;
   340    }else{
   341      p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
   342      if( p ){
   343        int i;
   344        memset(p, 0, sizeof(Unicode61Tokenizer));
   345        memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
   346        p->bRemoveDiacritic = 1;
   347        p->nFold = 64;
   348        p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
   349        if( p->aFold==0 ){
   350          rc = SQLITE_NOMEM;
   351        }
   352        for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
   353          const char *zArg = azArg[i+1];
   354          if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
   355            if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
   356              rc = SQLITE_ERROR;
   357            }
   358            p->bRemoveDiacritic = (zArg[0]=='1');
   359          }else
   360          if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
   361            rc = fts5UnicodeAddExceptions(p, zArg, 1);
   362          }else
   363          if( 0==sqlite3_stricmp(azArg[i], "separators") ){
   364            rc = fts5UnicodeAddExceptions(p, zArg, 0);
   365          }else{
   366            rc = SQLITE_ERROR;
   367          }
   368        }
   369      }else{
   370        rc = SQLITE_NOMEM;
   371      }
   372      if( rc!=SQLITE_OK ){
   373        fts5UnicodeDelete((Fts5Tokenizer*)p);
   374        p = 0;
   375      }
   376      *ppOut = (Fts5Tokenizer*)p;
   377    }
   378    return rc;
   379  }
   380  
   381  /*
   382  ** Return true if, for the purposes of tokenizing with the tokenizer
   383  ** passed as the first argument, codepoint iCode is considered a token 
   384  ** character (not a separator).
   385  */
   386  static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
   387    assert( (sqlite3Fts5UnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
   388    return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode);
   389  }
   390  
   391  static int fts5UnicodeTokenize(
   392    Fts5Tokenizer *pTokenizer,
   393    void *pCtx,
   394    int iUnused,
   395    const char *pText, int nText,
   396    int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
   397  ){
   398    Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
   399    int rc = SQLITE_OK;
   400    unsigned char *a = p->aTokenChar;
   401  
   402    unsigned char *zTerm = (unsigned char*)&pText[nText];
   403    unsigned char *zCsr = (unsigned char *)pText;
   404  
   405    /* Output buffer */
   406    char *aFold = p->aFold;
   407    int nFold = p->nFold;
   408    const char *pEnd = &aFold[nFold-6];
   409  
   410    UNUSED_PARAM(iUnused);
   411  
   412    /* Each iteration of this loop gobbles up a contiguous run of separators,
   413    ** then the next token.  */
   414    while( rc==SQLITE_OK ){
   415      int iCode;                    /* non-ASCII codepoint read from input */
   416      char *zOut = aFold;
   417      int is;
   418      int ie;
   419  
   420      /* Skip any separator characters. */
   421      while( 1 ){
   422        if( zCsr>=zTerm ) goto tokenize_done;
   423        if( *zCsr & 0x80 ) {
   424          /* A character outside of the ascii range. Skip past it if it is
   425          ** a separator character. Or break out of the loop if it is not. */
   426          is = zCsr - (unsigned char*)pText;
   427          READ_UTF8(zCsr, zTerm, iCode);
   428          if( fts5UnicodeIsAlnum(p, iCode) ){
   429            goto non_ascii_tokenchar;
   430          }
   431        }else{
   432          if( a[*zCsr] ){
   433            is = zCsr - (unsigned char*)pText;
   434            goto ascii_tokenchar;
   435          }
   436          zCsr++;
   437        }
   438      }
   439  
   440      /* Run through the tokenchars. Fold them into the output buffer along
   441      ** the way.  */
   442      while( zCsr<zTerm ){
   443  
   444        /* Grow the output buffer so that there is sufficient space to fit the
   445        ** largest possible utf-8 character.  */
   446        if( zOut>pEnd ){
   447          aFold = sqlite3_malloc(nFold*2);
   448          if( aFold==0 ){
   449            rc = SQLITE_NOMEM;
   450            goto tokenize_done;
   451          }
   452          zOut = &aFold[zOut - p->aFold];
   453          memcpy(aFold, p->aFold, nFold);
   454          sqlite3_free(p->aFold);
   455          p->aFold = aFold;
   456          p->nFold = nFold = nFold*2;
   457          pEnd = &aFold[nFold-6];
   458        }
   459  
   460        if( *zCsr & 0x80 ){
   461          /* An non-ascii-range character. Fold it into the output buffer if
   462          ** it is a token character, or break out of the loop if it is not. */
   463          READ_UTF8(zCsr, zTerm, iCode);
   464          if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
   465   non_ascii_tokenchar:
   466            iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
   467            if( iCode ) WRITE_UTF8(zOut, iCode);
   468          }else{
   469            break;
   470          }
   471        }else if( a[*zCsr]==0 ){
   472          /* An ascii-range separator character. End of token. */
   473          break; 
   474        }else{
   475   ascii_tokenchar:
   476          if( *zCsr>='A' && *zCsr<='Z' ){
   477            *zOut++ = *zCsr + 32;
   478          }else{
   479            *zOut++ = *zCsr;
   480          }
   481          zCsr++;
   482        }
   483        ie = zCsr - (unsigned char*)pText;
   484      }
   485  
   486      /* Invoke the token callback */
   487      rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie); 
   488    }
   489    
   490   tokenize_done:
   491    if( rc==SQLITE_DONE ) rc = SQLITE_OK;
   492    return rc;
   493  }
   494  
   495  /**************************************************************************
   496  ** Start of porter stemmer implementation.
   497  */
   498  
   499  /* Any tokens larger than this (in bytes) are passed through without
   500  ** stemming. */
   501  #define FTS5_PORTER_MAX_TOKEN 64
   502  
   503  typedef struct PorterTokenizer PorterTokenizer;
   504  struct PorterTokenizer {
   505    fts5_tokenizer tokenizer;       /* Parent tokenizer module */
   506    Fts5Tokenizer *pTokenizer;      /* Parent tokenizer instance */
   507    char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
   508  };
   509  
   510  /*
   511  ** Delete a "porter" tokenizer.
   512  */
   513  static void fts5PorterDelete(Fts5Tokenizer *pTok){
   514    if( pTok ){
   515      PorterTokenizer *p = (PorterTokenizer*)pTok;
   516      if( p->pTokenizer ){
   517        p->tokenizer.xDelete(p->pTokenizer);
   518      }
   519      sqlite3_free(p);
   520    }
   521  }
   522  
   523  /*
   524  ** Create a "porter" tokenizer.
   525  */
   526  static int fts5PorterCreate(
   527    void *pCtx, 
   528    const char **azArg, int nArg,
   529    Fts5Tokenizer **ppOut
   530  ){
   531    fts5_api *pApi = (fts5_api*)pCtx;
   532    int rc = SQLITE_OK;
   533    PorterTokenizer *pRet;
   534    void *pUserdata = 0;
   535    const char *zBase = "unicode61";
   536  
   537    if( nArg>0 ){
   538      zBase = azArg[0];
   539    }
   540  
   541    pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
   542    if( pRet ){
   543      memset(pRet, 0, sizeof(PorterTokenizer));
   544      rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
   545    }else{
   546      rc = SQLITE_NOMEM;
   547    }
   548    if( rc==SQLITE_OK ){
   549      int nArg2 = (nArg>0 ? nArg-1 : 0);
   550      const char **azArg2 = (nArg2 ? &azArg[1] : 0);
   551      rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
   552    }
   553  
   554    if( rc!=SQLITE_OK ){
   555      fts5PorterDelete((Fts5Tokenizer*)pRet);
   556      pRet = 0;
   557    }
   558    *ppOut = (Fts5Tokenizer*)pRet;
   559    return rc;
   560  }
   561  
   562  typedef struct PorterContext PorterContext;
   563  struct PorterContext {
   564    void *pCtx;
   565    int (*xToken)(void*, int, const char*, int, int, int);
   566    char *aBuf;
   567  };
   568  
   569  typedef struct PorterRule PorterRule;
   570  struct PorterRule {
   571    const char *zSuffix;
   572    int nSuffix;
   573    int (*xCond)(char *zStem, int nStem);
   574    const char *zOutput;
   575    int nOutput;
   576  };
   577  
   578  #if 0
   579  static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
   580    int ret = -1;
   581    int nBuf = *pnBuf;
   582    PorterRule *p;
   583  
   584    for(p=aRule; p->zSuffix; p++){
   585      assert( strlen(p->zSuffix)==p->nSuffix );
   586      assert( strlen(p->zOutput)==p->nOutput );
   587      if( nBuf<p->nSuffix ) continue;
   588      if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
   589    }
   590  
   591    if( p->zSuffix ){
   592      int nStem = nBuf - p->nSuffix;
   593      if( p->xCond==0 || p->xCond(aBuf, nStem) ){
   594        memcpy(&aBuf[nStem], p->zOutput, p->nOutput);
   595        *pnBuf = nStem + p->nOutput;
   596        ret = p - aRule;
   597      }
   598    }
   599  
   600    return ret;
   601  }
   602  #endif
   603  
   604  static int fts5PorterIsVowel(char c, int bYIsVowel){
   605    return (
   606        c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || (bYIsVowel && c=='y')
   607    );
   608  }
   609  
   610  static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
   611    int i;
   612    int bCons = bPrevCons;
   613  
   614    /* Scan for a vowel */
   615    for(i=0; i<nStem; i++){
   616      if( 0==(bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) break;
   617    }
   618  
   619    /* Scan for a consonent */
   620    for(i++; i<nStem; i++){
   621      if( (bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) return i+1;
   622    }
   623    return 0;
   624  }
   625  
   626  /* porter rule condition: (m > 0) */
   627  static int fts5Porter_MGt0(char *zStem, int nStem){
   628    return !!fts5PorterGobbleVC(zStem, nStem, 0);
   629  }
   630  
   631  /* porter rule condition: (m > 1) */
   632  static int fts5Porter_MGt1(char *zStem, int nStem){
   633    int n;
   634    n = fts5PorterGobbleVC(zStem, nStem, 0);
   635    if( n && fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
   636      return 1;
   637    }
   638    return 0;
   639  }
   640  
   641  /* porter rule condition: (m = 1) */
   642  static int fts5Porter_MEq1(char *zStem, int nStem){
   643    int n;
   644    n = fts5PorterGobbleVC(zStem, nStem, 0);
   645    if( n && 0==fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
   646      return 1;
   647    }
   648    return 0;
   649  }
   650  
   651  /* porter rule condition: (*o) */
   652  static int fts5Porter_Ostar(char *zStem, int nStem){
   653    if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
   654      return 0;
   655    }else{
   656      int i;
   657      int mask = 0;
   658      int bCons = 0;
   659      for(i=0; i<nStem; i++){
   660        bCons = !fts5PorterIsVowel(zStem[i], bCons);
   661        assert( bCons==0 || bCons==1 );
   662        mask = (mask << 1) + bCons;
   663      }
   664      return ((mask & 0x0007)==0x0005);
   665    }
   666  }
   667  
   668  /* porter rule condition: (m > 1 and (*S or *T)) */
   669  static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
   670    assert( nStem>0 );
   671    return (zStem[nStem-1]=='s' || zStem[nStem-1]=='t') 
   672        && fts5Porter_MGt1(zStem, nStem);
   673  }
   674  
   675  /* porter rule condition: (*v*) */
   676  static int fts5Porter_Vowel(char *zStem, int nStem){
   677    int i;
   678    for(i=0; i<nStem; i++){
   679      if( fts5PorterIsVowel(zStem[i], i>0) ){
   680        return 1;
   681      }
   682    }
   683    return 0;
   684  }
   685  
   686  
   687  /**************************************************************************
   688  ***************************************************************************
   689  ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
   690  */
   691  
   692  static int fts5PorterStep4(char *aBuf, int *pnBuf){
   693    int ret = 0;
   694    int nBuf = *pnBuf;
   695    switch( aBuf[nBuf-2] ){
   696      
   697      case 'a': 
   698        if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
   699          if( fts5Porter_MGt1(aBuf, nBuf-2) ){
   700            *pnBuf = nBuf - 2;
   701          }
   702        }
   703        break;
   704    
   705      case 'c': 
   706        if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
   707          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
   708            *pnBuf = nBuf - 4;
   709          }
   710        }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
   711          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
   712            *pnBuf = nBuf - 4;
   713          }
   714        }
   715        break;
   716    
   717      case 'e': 
   718        if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
   719          if( fts5Porter_MGt1(aBuf, nBuf-2) ){
   720            *pnBuf = nBuf - 2;
   721          }
   722        }
   723        break;
   724    
   725      case 'i': 
   726        if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
   727          if( fts5Porter_MGt1(aBuf, nBuf-2) ){
   728            *pnBuf = nBuf - 2;
   729          }
   730        }
   731        break;
   732    
   733      case 'l': 
   734        if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
   735          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
   736            *pnBuf = nBuf - 4;
   737          }
   738        }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
   739          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
   740            *pnBuf = nBuf - 4;
   741          }
   742        }
   743        break;
   744    
   745      case 'n': 
   746        if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
   747          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
   748            *pnBuf = nBuf - 3;
   749          }
   750        }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
   751          if( fts5Porter_MGt1(aBuf, nBuf-5) ){
   752            *pnBuf = nBuf - 5;
   753          }
   754        }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
   755          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
   756            *pnBuf = nBuf - 4;
   757          }
   758        }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
   759          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
   760            *pnBuf = nBuf - 3;
   761          }
   762        }
   763        break;
   764    
   765      case 'o': 
   766        if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
   767          if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
   768            *pnBuf = nBuf - 3;
   769          }
   770        }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
   771          if( fts5Porter_MGt1(aBuf, nBuf-2) ){
   772            *pnBuf = nBuf - 2;
   773          }
   774        }
   775        break;
   776    
   777      case 's': 
   778        if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
   779          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
   780            *pnBuf = nBuf - 3;
   781          }
   782        }
   783        break;
   784    
   785      case 't': 
   786        if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
   787          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
   788            *pnBuf = nBuf - 3;
   789          }
   790        }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
   791          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
   792            *pnBuf = nBuf - 3;
   793          }
   794        }
   795        break;
   796    
   797      case 'u': 
   798        if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
   799          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
   800            *pnBuf = nBuf - 3;
   801          }
   802        }
   803        break;
   804    
   805      case 'v': 
   806        if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
   807          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
   808            *pnBuf = nBuf - 3;
   809          }
   810        }
   811        break;
   812    
   813      case 'z': 
   814        if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
   815          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
   816            *pnBuf = nBuf - 3;
   817          }
   818        }
   819        break;
   820    
   821    }
   822    return ret;
   823  }
   824    
   825  
   826  static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
   827    int ret = 0;
   828    int nBuf = *pnBuf;
   829    switch( aBuf[nBuf-2] ){
   830      
   831      case 'a': 
   832        if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
   833          memcpy(&aBuf[nBuf-2], "ate", 3);
   834          *pnBuf = nBuf - 2 + 3;
   835          ret = 1;
   836        }
   837        break;
   838    
   839      case 'b': 
   840        if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
   841          memcpy(&aBuf[nBuf-2], "ble", 3);
   842          *pnBuf = nBuf - 2 + 3;
   843          ret = 1;
   844        }
   845        break;
   846    
   847      case 'i': 
   848        if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
   849          memcpy(&aBuf[nBuf-2], "ize", 3);
   850          *pnBuf = nBuf - 2 + 3;
   851          ret = 1;
   852        }
   853        break;
   854    
   855    }
   856    return ret;
   857  }
   858    
   859  
   860  static int fts5PorterStep2(char *aBuf, int *pnBuf){
   861    int ret = 0;
   862    int nBuf = *pnBuf;
   863    switch( aBuf[nBuf-2] ){
   864      
   865      case 'a': 
   866        if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
   867          if( fts5Porter_MGt0(aBuf, nBuf-7) ){
   868            memcpy(&aBuf[nBuf-7], "ate", 3);
   869            *pnBuf = nBuf - 7 + 3;
   870          }
   871        }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
   872          if( fts5Porter_MGt0(aBuf, nBuf-6) ){
   873            memcpy(&aBuf[nBuf-6], "tion", 4);
   874            *pnBuf = nBuf - 6 + 4;
   875          }
   876        }
   877        break;
   878    
   879      case 'c': 
   880        if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
   881          if( fts5Porter_MGt0(aBuf, nBuf-4) ){
   882            memcpy(&aBuf[nBuf-4], "ence", 4);
   883            *pnBuf = nBuf - 4 + 4;
   884          }
   885        }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
   886          if( fts5Porter_MGt0(aBuf, nBuf-4) ){
   887            memcpy(&aBuf[nBuf-4], "ance", 4);
   888            *pnBuf = nBuf - 4 + 4;
   889          }
   890        }
   891        break;
   892    
   893      case 'e': 
   894        if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
   895          if( fts5Porter_MGt0(aBuf, nBuf-4) ){
   896            memcpy(&aBuf[nBuf-4], "ize", 3);
   897            *pnBuf = nBuf - 4 + 3;
   898          }
   899        }
   900        break;
   901    
   902      case 'g': 
   903        if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
   904          if( fts5Porter_MGt0(aBuf, nBuf-4) ){
   905            memcpy(&aBuf[nBuf-4], "log", 3);
   906            *pnBuf = nBuf - 4 + 3;
   907          }
   908        }
   909        break;
   910    
   911      case 'l': 
   912        if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
   913          if( fts5Porter_MGt0(aBuf, nBuf-3) ){
   914            memcpy(&aBuf[nBuf-3], "ble", 3);
   915            *pnBuf = nBuf - 3 + 3;
   916          }
   917        }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
   918          if( fts5Porter_MGt0(aBuf, nBuf-4) ){
   919            memcpy(&aBuf[nBuf-4], "al", 2);
   920            *pnBuf = nBuf - 4 + 2;
   921          }
   922        }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
   923          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
   924            memcpy(&aBuf[nBuf-5], "ent", 3);
   925            *pnBuf = nBuf - 5 + 3;
   926          }
   927        }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
   928          if( fts5Porter_MGt0(aBuf, nBuf-3) ){
   929            memcpy(&aBuf[nBuf-3], "e", 1);
   930            *pnBuf = nBuf - 3 + 1;
   931          }
   932        }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
   933          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
   934            memcpy(&aBuf[nBuf-5], "ous", 3);
   935            *pnBuf = nBuf - 5 + 3;
   936          }
   937        }
   938        break;
   939    
   940      case 'o': 
   941        if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
   942          if( fts5Porter_MGt0(aBuf, nBuf-7) ){
   943            memcpy(&aBuf[nBuf-7], "ize", 3);
   944            *pnBuf = nBuf - 7 + 3;
   945          }
   946        }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
   947          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
   948            memcpy(&aBuf[nBuf-5], "ate", 3);
   949            *pnBuf = nBuf - 5 + 3;
   950          }
   951        }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
   952          if( fts5Porter_MGt0(aBuf, nBuf-4) ){
   953            memcpy(&aBuf[nBuf-4], "ate", 3);
   954            *pnBuf = nBuf - 4 + 3;
   955          }
   956        }
   957        break;
   958    
   959      case 's': 
   960        if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
   961          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
   962            memcpy(&aBuf[nBuf-5], "al", 2);
   963            *pnBuf = nBuf - 5 + 2;
   964          }
   965        }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
   966          if( fts5Porter_MGt0(aBuf, nBuf-7) ){
   967            memcpy(&aBuf[nBuf-7], "ive", 3);
   968            *pnBuf = nBuf - 7 + 3;
   969          }
   970        }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
   971          if( fts5Porter_MGt0(aBuf, nBuf-7) ){
   972            memcpy(&aBuf[nBuf-7], "ful", 3);
   973            *pnBuf = nBuf - 7 + 3;
   974          }
   975        }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
   976          if( fts5Porter_MGt0(aBuf, nBuf-7) ){
   977            memcpy(&aBuf[nBuf-7], "ous", 3);
   978            *pnBuf = nBuf - 7 + 3;
   979          }
   980        }
   981        break;
   982    
   983      case 't': 
   984        if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
   985          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
   986            memcpy(&aBuf[nBuf-5], "al", 2);
   987            *pnBuf = nBuf - 5 + 2;
   988          }
   989        }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
   990          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
   991            memcpy(&aBuf[nBuf-5], "ive", 3);
   992            *pnBuf = nBuf - 5 + 3;
   993          }
   994        }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
   995          if( fts5Porter_MGt0(aBuf, nBuf-6) ){
   996            memcpy(&aBuf[nBuf-6], "ble", 3);
   997            *pnBuf = nBuf - 6 + 3;
   998          }
   999        }
  1000        break;
  1001    
  1002    }
  1003    return ret;
  1004  }
  1005    
  1006  
  1007  static int fts5PorterStep3(char *aBuf, int *pnBuf){
  1008    int ret = 0;
  1009    int nBuf = *pnBuf;
  1010    switch( aBuf[nBuf-2] ){
  1011      
  1012      case 'a': 
  1013        if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
  1014          if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  1015            memcpy(&aBuf[nBuf-4], "ic", 2);
  1016            *pnBuf = nBuf - 4 + 2;
  1017          }
  1018        }
  1019        break;
  1020    
  1021      case 's': 
  1022        if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
  1023          if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  1024            *pnBuf = nBuf - 4;
  1025          }
  1026        }
  1027        break;
  1028    
  1029      case 't': 
  1030        if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
  1031          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  1032            memcpy(&aBuf[nBuf-5], "ic", 2);
  1033            *pnBuf = nBuf - 5 + 2;
  1034          }
  1035        }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
  1036          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  1037            memcpy(&aBuf[nBuf-5], "ic", 2);
  1038            *pnBuf = nBuf - 5 + 2;
  1039          }
  1040        }
  1041        break;
  1042    
  1043      case 'u': 
  1044        if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
  1045          if( fts5Porter_MGt0(aBuf, nBuf-3) ){
  1046            *pnBuf = nBuf - 3;
  1047          }
  1048        }
  1049        break;
  1050    
  1051      case 'v': 
  1052        if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
  1053          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  1054            *pnBuf = nBuf - 5;
  1055          }
  1056        }
  1057        break;
  1058    
  1059      case 'z': 
  1060        if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
  1061          if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  1062            memcpy(&aBuf[nBuf-5], "al", 2);
  1063            *pnBuf = nBuf - 5 + 2;
  1064          }
  1065        }
  1066        break;
  1067    
  1068    }
  1069    return ret;
  1070  }
  1071    
  1072  
  1073  static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  1074    int ret = 0;
  1075    int nBuf = *pnBuf;
  1076    switch( aBuf[nBuf-2] ){
  1077      
  1078      case 'e': 
  1079        if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
  1080          if( fts5Porter_MGt0(aBuf, nBuf-3) ){
  1081            memcpy(&aBuf[nBuf-3], "ee", 2);
  1082            *pnBuf = nBuf - 3 + 2;
  1083          }
  1084        }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
  1085          if( fts5Porter_Vowel(aBuf, nBuf-2) ){
  1086            *pnBuf = nBuf - 2;
  1087            ret = 1;
  1088          }
  1089        }
  1090        break;
  1091    
  1092      case 'n': 
  1093        if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
  1094          if( fts5Porter_Vowel(aBuf, nBuf-3) ){
  1095            *pnBuf = nBuf - 3;
  1096            ret = 1;
  1097          }
  1098        }
  1099        break;
  1100    
  1101    }
  1102    return ret;
  1103  }
  1104    
  1105  /* 
  1106  ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
  1107  ***************************************************************************
  1108  **************************************************************************/
  1109  
  1110  static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  1111    int nBuf = *pnBuf;
  1112    if( aBuf[nBuf-1]=='s' ){
  1113      if( aBuf[nBuf-2]=='e' ){
  1114        if( (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s') 
  1115         || (nBuf>3 && aBuf[nBuf-3]=='i' )
  1116        ){
  1117          *pnBuf = nBuf-2;
  1118        }else{
  1119          *pnBuf = nBuf-1;
  1120        }
  1121      }
  1122      else if( aBuf[nBuf-2]!='s' ){
  1123        *pnBuf = nBuf-1;
  1124      }
  1125    }
  1126  }
  1127  
  1128  static int fts5PorterCb(
  1129    void *pCtx, 
  1130    int tflags,
  1131    const char *pToken, 
  1132    int nToken, 
  1133    int iStart, 
  1134    int iEnd
  1135  ){
  1136    PorterContext *p = (PorterContext*)pCtx;
  1137  
  1138    char *aBuf;
  1139    int nBuf;
  1140  
  1141    if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
  1142    aBuf = p->aBuf;
  1143    nBuf = nToken;
  1144    memcpy(aBuf, pToken, nBuf);
  1145  
  1146    /* Step 1. */
  1147    fts5PorterStep1A(aBuf, &nBuf);
  1148    if( fts5PorterStep1B(aBuf, &nBuf) ){
  1149      if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
  1150        char c = aBuf[nBuf-1];
  1151        if( fts5PorterIsVowel(c, 0)==0 
  1152         && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2] 
  1153        ){
  1154          nBuf--;
  1155        }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
  1156          aBuf[nBuf++] = 'e';
  1157        }
  1158      }
  1159    }
  1160  
  1161    /* Step 1C. */
  1162    if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
  1163      aBuf[nBuf-1] = 'i';
  1164    }
  1165  
  1166    /* Steps 2 through 4. */
  1167    fts5PorterStep2(aBuf, &nBuf);
  1168    fts5PorterStep3(aBuf, &nBuf);
  1169    fts5PorterStep4(aBuf, &nBuf);
  1170  
  1171    /* Step 5a. */
  1172    assert( nBuf>0 );
  1173    if( aBuf[nBuf-1]=='e' ){
  1174      if( fts5Porter_MGt1(aBuf, nBuf-1) 
  1175       || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
  1176      ){
  1177        nBuf--;
  1178      }
  1179    }
  1180  
  1181    /* Step 5b. */
  1182    if( nBuf>1 && aBuf[nBuf-1]=='l' 
  1183     && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1) 
  1184    ){
  1185      nBuf--;
  1186    }
  1187  
  1188    return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
  1189  
  1190   pass_through:
  1191    return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
  1192  }
  1193  
  1194  /*
  1195  ** Tokenize using the porter tokenizer.
  1196  */
  1197  static int fts5PorterTokenize(
  1198    Fts5Tokenizer *pTokenizer,
  1199    void *pCtx,
  1200    int flags,
  1201    const char *pText, int nText,
  1202    int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
  1203  ){
  1204    PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
  1205    PorterContext sCtx;
  1206    sCtx.xToken = xToken;
  1207    sCtx.pCtx = pCtx;
  1208    sCtx.aBuf = p->aBuf;
  1209    return p->tokenizer.xTokenize(
  1210        p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
  1211    );
  1212  }
  1213  
  1214  /*
  1215  ** Register all built-in tokenizers with FTS5.
  1216  */
  1217  int sqlite3Fts5TokenizerInit(fts5_api *pApi){
  1218    struct BuiltinTokenizer {
  1219      const char *zName;
  1220      fts5_tokenizer x;
  1221    } aBuiltin[] = {
  1222      { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
  1223      { "ascii",     {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
  1224      { "porter",    {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
  1225    };
  1226    
  1227    int rc = SQLITE_OK;             /* Return code */
  1228    int i;                          /* To iterate through builtin functions */
  1229  
  1230    for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
  1231      rc = pApi->xCreateTokenizer(pApi,
  1232          aBuiltin[i].zName,
  1233          (void*)pApi,
  1234          &aBuiltin[i].x,
  1235          0
  1236      );
  1237    }
  1238  
  1239    return rc;
  1240  }
  1241  
  1242