modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts1/simple_tokenizer.c (about)

     1  /*
     2  ** The author disclaims copyright to this source code.
     3  **
     4  *************************************************************************
     5  ** Implementation of the "simple" full-text-search tokenizer.
     6  */
     7  
     8  #include <assert.h>
     9  #if !defined(__APPLE__)
    10  #include <malloc.h>
    11  #else
    12  #include <stdlib.h>
    13  #endif
    14  #include <stdio.h>
    15  #include <string.h>
    16  #include <ctype.h>
    17  
    18  #include "tokenizer.h"
    19  
    20  /* Duplicate a string; the caller must free() the returned string.
    21   * (We don't use strdup() since it's not part of the standard C library and
    22   * may not be available everywhere.) */
    23  /* TODO(shess) Copied from fulltext.c, consider util.c for such
    24  ** things. */
    25  static char *string_dup(const char *s){
    26    char *str = malloc(strlen(s) + 1);
    27    strcpy(str, s);
    28    return str;
    29  }
    30  
/*
** A "simple" tokenizer instance.  The generic base object is the first
** member, which is what allows the rest of this file to cast between
** sqlite3_tokenizer* and simple_tokenizer*.
*/
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;
  const char *zDelim;          /* token delimiters; nul-terminated heap copy
                               ** made in simpleCreate() and released in
                               ** simpleDestroy() */
} simple_tokenizer;
    35  
/*
** State of one pass over an input buffer.  The generic base object is the
** first member, which is what allows the rest of this file to cast between
** sqlite3_tokenizer_cursor* and simple_tokenizer_cursor*.
*/
typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing (the pointer given
                               ** to simpleOpen(); the text is not copied) */
  int nBytes;                  /* size of the input */
  const char *pCurrent;        /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token; heap buffer
                               ** released in simpleClose() */
  int nTokenBytes;             /* actual size of current token */
  int nTokenAllocated;         /* space allocated to zToken buffer */
} simple_tokenizer_cursor;
    46  
/* Forward declaration; the object is defined, together with its method
** table, after the method implementations below. */
static sqlite3_tokenizer_module simpleTokenizerModule;
    48  
    49  static int simpleCreate(
    50    int argc, const char **argv,
    51    sqlite3_tokenizer **ppTokenizer
    52  ){
    53    simple_tokenizer *t;
    54  
    55    t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
    56    /* TODO(shess) Delimiters need to remain the same from run to run,
    57    ** else we need to reindex.  One solution would be a meta-table to
    58    ** track such information in the database, then we'd only want this
    59    ** information on the initial create.
    60    */
    61    if( argc>1 ){
    62      t->zDelim = string_dup(argv[1]);
    63    } else {
    64      /* Build a string excluding alphanumeric ASCII characters */
    65      char zDelim[0x80];               /* nul-terminated, so nul not a member */
    66      int i, j;
    67      for(i=1, j=0; i<0x80; i++){
    68        if( !isalnum(i) ){
    69          zDelim[j++] = i;
    70        }
    71      }
    72      zDelim[j++] = '\0';
    73      assert( j<=sizeof(zDelim) );
    74      t->zDelim = string_dup(zDelim);
    75    }
    76  
    77    *ppTokenizer = &t->base;
    78    return SQLITE_OK;
    79  }
    80  
    81  static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
    82    simple_tokenizer *t = (simple_tokenizer *) pTokenizer;
    83  
    84    free((void *) t->zDelim);
    85    free(t);
    86  
    87    return SQLITE_OK;
    88  }
    89  
    90  static int simpleOpen(
    91    sqlite3_tokenizer *pTokenizer,
    92    const char *pInput, int nBytes,
    93    sqlite3_tokenizer_cursor **ppCursor
    94  ){
    95    simple_tokenizer_cursor *c;
    96  
    97    c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
    98    c->pInput = pInput;
    99    c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
   100    c->pCurrent = c->pInput;        /* start tokenizing at the beginning */
   101    c->iToken = 0;
   102    c->zToken = NULL;               /* no space allocated, yet. */
   103    c->nTokenBytes = 0;
   104    c->nTokenAllocated = 0;
   105  
   106    *ppCursor = &c->base;
   107    return SQLITE_OK;
   108  }
   109  
   110  static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
   111    simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
   112  
   113    if( NULL!=c->zToken ){
   114      free(c->zToken);
   115    }
   116    free(c);
   117  
   118    return SQLITE_OK;
   119  }
   120  
   121  static int simpleNext(
   122    sqlite3_tokenizer_cursor *pCursor,
   123    const char **ppToken, int *pnBytes,
   124    int *piStartOffset, int *piEndOffset, int *piPosition
   125  ){
   126    simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
   127    simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
   128    int ii;
   129  
   130    while( c->pCurrent-c->pInput<c->nBytes ){
   131      int n = (int) strcspn(c->pCurrent, t->zDelim);
   132      if( n>0 ){
   133        if( n+1>c->nTokenAllocated ){
   134          c->zToken = realloc(c->zToken, n+1);
   135        }
   136        for(ii=0; ii<n; ii++){
   137          /* TODO(shess) This needs expansion to handle UTF-8
   138          ** case-insensitivity.
   139          */
   140          char ch = c->pCurrent[ii];
   141          c->zToken[ii] = (unsigned char)ch<0x80 ? tolower((unsigned char)ch):ch;
   142        }
   143        c->zToken[n] = '\0';
   144        *ppToken = c->zToken;
   145        *pnBytes = n;
   146        *piStartOffset = (int) (c->pCurrent-c->pInput);
   147        *piEndOffset = *piStartOffset+n;
   148        *piPosition = c->iToken++;
   149        c->pCurrent += n + 1;
   150  
   151        return SQLITE_OK;
   152      }
   153      c->pCurrent += n + 1;
   154      /* TODO(shess) could strspn() to skip delimiters en masse.  Needs
   155      ** to happen in two places, though, which is annoying.
   156      */
   157    }
   158    return SQLITE_DONE;
   159  }
   160  
/*
** Method table for the "simple" tokenizer.  The initializer is positional,
** so the entries must stay in the order in which the members of
** sqlite3_tokenizer_module are declared (the declaration is not in this
** file).
*/
static sqlite3_tokenizer_module simpleTokenizerModule = {
  0,                /* NOTE(review): presumably a version/reserved field --
                    ** confirm against the struct declaration */
  simpleCreate,     /* build a tokenizer */
  simpleDestroy,    /* free a tokenizer */
  simpleOpen,       /* start a pass over an input buffer */
  simpleClose,      /* end a pass */
  simpleNext,       /* fetch the next token of a pass */
};
   169  
   170  void get_simple_tokenizer_module(
   171    sqlite3_tokenizer_module **ppModule
   172  ){
   173    *ppModule = &simpleTokenizerModule;
   174  }