modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts5/test/fts5tokenizer.test (about)

     1  # 2014 Dec 20
     2  #
     3  # The author disclaims copyright to this source code.  In place of
     4  # a legal notice, here is a blessing:
     5  #
     6  #    May you do good and not evil.
     7  #    May you find forgiveness for yourself and forgive others.
     8  #    May you share freely, never taking more than you give.
     9  #
    10  #***********************************************************************
    11  #
    12  # Tests focusing on the built-in fts5 tokenizers. 
    13  #
    14  
    15  source [file join [file dirname [info script]] fts5_common.tcl]
    16  set testprefix fts5tokenizer
    17  
    18  # If SQLITE_ENABLE_FTS5 is not defined, omit this file.
    19  ifcapable !fts5 {
    20    finish_test
    21    return
    22  }
    23  
    24  # 1.*: Accepted spellings of the tokenize= option, then unknown tokenizers.
    25  do_execsql_test 1.0 {
    26    CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
    27    DROP TABLE ft1;
    28  }
    29  do_execsql_test 1.1 {
    30    CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter');
    31    DROP TABLE ft1;
    32  }
    33  do_execsql_test 1.2 {
    34    CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter);
    35    DROP TABLE ft1;
    36  }
    37  do_execsql_test 1.3 {
    38    CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
    39    DROP TABLE ft1;
    40  }
    41  do_execsql_test 1.4 {
    42    CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
    43    DROP TABLE ft1;
    44  }
    45  
    46  do_catchsql_test 1.5 {
    47    CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch');
    48  } {1 {no such tokenizer: nosuch}}
    49  
    50  do_catchsql_test 1.6 {
    51    CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch');
    52  } {1 {error in tokenizer constructor}}
    53  # 2.*: With porter, 'embedding' and 'database' match 'embedded databases'.
    54  do_execsql_test 2.0 {
    55    CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
    56    INSERT INTO ft1 VALUES('embedded databases');
    57  }
    58  do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
    59  do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
    60  do_execsql_test 2.3 { 
    61    SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding' 
    62  } 1
    63  # Create callback for the 'tcl' tokenizer: stores its args in ::targs, then fails.
    64  proc tcl_create {args} {
    65    global targs; set targs $args
    66    error "failed"
    67  }
    68  sqlite3_fts5_create_tokenizer db tcl tcl_create
    69  # 3.*: Words after the tokenizer name reach tcl_create as separate arguments.
    70  foreach {tn directive expected} {
    71    1 {tokenize='tcl a b c'}             {a b c}
    72    2 {tokenize='tcl ''d'' ''e'' ''f'''} {d e f}
    73    3 {tokenize="tcl 'g' 'h' 'i'"}       {g h i}
    74    4 {tokenize = tcl}                   {}
    75  } {
    76    do_catchsql_test 3.$tn.1 "
    77      CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
    78    " {1 {error in tokenizer constructor}}
    79    do_test 3.$tn.2 { set ::targs } $expected
    80  }
    81  # 4.*: A malformed tokenize= directive and an unrecognized column option.
    82  do_catchsql_test 4.1 {
    83    CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
    84  } {1 {parse error in "tokenize = tcl abc"}}
    85  do_catchsql_test 4.2 {
    86    CREATE VIRTUAL TABLE ft2 USING fts5(x y)
    87  } {1 {unrecognized column option: y}}
    88  
    89  #-------------------------------------------------------------------------
    90  # Test the "separators" and "tokenchars" options a bit (ascii and unicode61).
    91  # Tests are named 5.0.* so they cannot collide with 5.1.1 further down.
    92  foreach {tn tokenizer} {1 ascii 2 unicode61} {
    93    reset_db
    94    set T "$tokenizer tokenchars ',.:' separators 'xyz'"
    95    execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
    96    do_execsql_test 5.0.$tn.1 {
    97      INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
    98    }
    99    foreach {tn2 token res} {
   100      1 abc 1     2 def 1     3 ghi 1    4 jkl {}
   101      5 mno {}    6 pqr {}    7 stu {}   8 jkl.mno,pqr:stu 1
   102      9 vw  1
   103    } {
   104      do_execsql_test 5.0.$tn.2.$tn2 "
   105        SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"'
   106      " $res
   107    }
   108  }
   109  
   110  #-------------------------------------------------------------------------
   111  # Miscellaneous tests for the ascii tokenizer.
   112  #
   113  # 5.1.*: Test that the ascii tokenizer ignores non-ASCII characters in the
   114  #        'separators' option. But unicode61 does not.
   115  #
   116  # 5.2:   An option without an argument is an error.
   117  # 5.3:   So is the option name 'opt', even though it is given an argument.
   118  
   119  do_test 5.1.1 {
   120    execsql "
   121      CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
   122      INSERT INTO a1 VALUES('abc\u1234def');
   123    "
   124    execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' } 
   125  } {}
   126  
   127  do_test 5.1.2 {
   128    execsql "
   129      CREATE VIRTUAL TABLE a2 USING fts5(
   130          x, tokenize=`unicode61 separators '\u1234'`);
   131      INSERT INTO a2 VALUES('abc\u1234def');
   132    "
   133    execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' } 
   134  } {1}
   135  
   136  do_catchsql_test 5.2 {
   137    CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
   138  } {1 {error in tokenizer constructor}}
   139  do_catchsql_test 5.3 {
   140    CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
   141  } {1 {error in tokenizer constructor}}
   142  
   143  #-------------------------------------------------------------------------
   144  # Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE 
   145  # correctly.
   146  #
   147  # Append token to the caller's list; reply SQLITE_DONE when it holds exactly 3.
   148  proc test_token_cb {varname token iStart iEnd} {
   149    upvar $varname seen
   150    set count [llength [lappend seen $token]]
   151    set rc [expr {$count == 3 ? "SQLITE_DONE" : "SQLITE_OK"}]
   152    return $rc
   153  }
   154  # Runs xTokenize over the xColumnText of column 0; returns the collected tokens.
   155  proc tokenize {cmd} {
   156    set res {}; set text [$cmd xColumnText 0]
   157    $cmd xTokenize $text [list test_token_cb res]
   158    return $res
   159  }
   160  sqlite3_fts5_create_function db tokenize tokenize
   161  # 6.0, 6.1: tokenize() returns just the first three tokens of each row.
   162  do_execsql_test 6.0 {
   163    CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii);
   164    INSERT INTO x1 VALUES('q w e r t y');
   165    INSERT INTO x1 VALUES('y t r e w q');
   166    SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r';
   167  } {
   168    {q w e} {y t r}
   169  }
   170  
   171  do_execsql_test 6.1 {
   172    CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61);
   173    INSERT INTO x2 VALUES('q w e r t y');
   174    INSERT INTO x2 VALUES('y t r e w q');
   175    SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r';
   176  } {
   177    {q w e} {y t r}
   178  }
   179  
   180  
   181  #-------------------------------------------------------------------------
   182  # Miscellaneous tests for the unicode61 tokenizer. Numbered from 6.2 because
   183  # 6.0 and 6.1 are already used by the SQLITE_DONE tests above.
   184  do_catchsql_test 6.2 {
   185    CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
   186  } {1 {error in tokenizer constructor}}
   187  do_catchsql_test 6.3 {
   188    CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
   189  } {1 {error in tokenizer constructor}}
   190  do_catchsql_test 6.4 {
   191    CREATE VIRTUAL TABLE a3 USING fts5(
   192      x, y, tokenize = 'unicode61 remove_diacritics 2'
   193    );
   194  } {1 {error in tokenizer constructor}}
   195  do_catchsql_test 6.5 {
   196    CREATE VIRTUAL TABLE a3 USING fts5(
   197      x, y, tokenize = 'unicode61 remove_diacritics 10'
   198    );
   199  } {1 {error in tokenizer constructor}}
   200  
   201  #-------------------------------------------------------------------------
   202  # Porter tokenizer with very large tokens (100, 500 and 1000 characters).
   203  #
   204  set a [string repeat a 100]
   205  set b [string repeat b 500]
   206  set c [string repeat c 1000]
   207  do_execsql_test 7.0 {
   208    CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
   209    INSERT INTO e5 VALUES($a || ' ' || $b);
   210    INSERT INTO e5 VALUES($b || ' ' || $c);
   211    INSERT INTO e5 VALUES($c || ' ' || $a);
   212  }
   213  
   214  do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
   215  do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
   216  do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }
   217  
   218  #-------------------------------------------------------------------------
   219  # Test the 'separators' option with the unicode61 tokenizer.
   220  #
   221  do_execsql_test 8.1 {
   222    BEGIN;
   223    CREATE VIRTUAL TABLE e6 USING fts5(x,
   224      tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
   225    );
   226    INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
   227    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
   228    SELECT term FROM e7;
   229    ROLLBACK;
   230  } {
   231    brown dog fox jumped lazy over quick the
   232  }
   233  # 8.2: The separators are the non-ASCII characters U+0E01 through U+0E07.
   234  do_execsql_test 8.2 [subst {
   235    BEGIN;
   236    CREATE VIRTUAL TABLE e6 USING fts5(x,
   237      tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
   238    );
   239    INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01' 
   240                       || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
   241    );
   242    INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
   243    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
   244    SELECT term FROM e7;
   245    ROLLBACK;
   246  }] [subst {
   247    brown dog fox jumped lazy over quick the \u0E08 \u0E09
   248  }]
   249  
   250  # Test that the porter tokenizer correctly passes arguments through to
   251  # its parent tokenizer.
   252  do_execsql_test 8.3 {
   253    BEGIN;
   254    CREATE VIRTUAL TABLE e6 USING fts5(x,
   255      tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
   256    );
   257    INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
   258    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
   259    SELECT term FROM e7;
   260    ROLLBACK;
   261  } {
   262    brown dog fox jump lazi over quick the
   263  }
   264  
   265  #-------------------------------------------------------------------------
   266  # Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer
   267  # implementation.
   268  #
   269  reset_db
   270  proc tcl_create {args} { return tcl_tokenize }  ;# result names the proc below
   271  sqlite3_fts5_create_tokenizer db tcl tcl_create
   272  set ::flags {}
   273  proc tcl_tokenize {tflags text} {
   274    global flags; lappend flags $tflags   ;# remember the flags of this call
   275    foreach {word first last} [fts5_tokenize_split $text] {
   276      sqlite3_fts5_token $word $first $last
   277    }
   278  }
   279  # 9.1: Rows being inserted are tokenized with the "document" flag.
   280  do_execsql_test 9.1.1 {
   281    CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl);
   282    INSERT INTO t1 VALUES('abc');
   283    INSERT INTO t1 VALUES('xyz');
   284  } {}
   285  do_test 9.1.2 { set ::flags } {document document}
   286  # 9.2: A plain query term is tokenized with "query".
   287  set ::flags [list]
   288  do_execsql_test 9.2.1 { SELECT * FROM t1('abc'); } {abc}
   289  do_test 9.2.2 { set ::flags } {query}
   290  # 9.3: A term followed by "*" is tokenized with "prefixquery".
   291  set ::flags [list]
   292  do_execsql_test 9.3.1 { SELECT * FROM t1('ab*'); } {abc}
   293  do_test 9.3.2 { set ::flags } {prefixquery}
   294  # 9.4: So is a quoted phrase that is followed by "*".
   295  set ::flags [list]
   296  do_execsql_test 9.4.1 { SELECT * FROM t1('"abc xyz" *'); } {}
   297  do_test 9.4.2 { set ::flags } {prefixquery}
   298  # 9.5: A "*" inside the quotes is different: the phrase gets "query".
   299  set ::flags [list]
   300  do_execsql_test 9.5.1 { SELECT * FROM t1('"abc xyz*"'); } {}
   301  do_test 9.5.2 { set ::flags } {query}
   302  
   303  
   304  finish_test