github.com/jdgcs/sqlite3@v1.12.1-0.20210908114423-bc5f96e4dd51/testdata/tcl/fts4unicode.test (about)

     1  # 2012 May 25
     2  #
     3  # The author disclaims copyright to this source code.  In place of
     4  # a legal notice, here is a blessing:
     5  #
     6  #    May you do good and not evil.
     7  #    May you find forgiveness for yourself and forgive others.
     8  #    May you share freely, never taking more than you give.
     9  #
    10  #*************************************************************************
    11  #
    12  # The tests in this file focus on testing the "unicode" FTS tokenizer.
    13  #
    14  
        # Locate tester.tcl in the directory that holds this script and load it.
    15  set testdir [file dirname $argv0]
    16  source $testdir/tester.tcl
        # Without the fts3_unicode capability nothing below can run: finish the
        # test run and stop evaluating this file.
    17  ifcapable !fts3_unicode { finish_test ; return }
        # Name prefix for the tests in this file (presumably read by tester.tcl
        # when it reports results -- confirm).
    18  set ::testprefix fts4unicode
    19  
    20  proc do_unicode_token_test {tn input res} {
    21    set input [string map {' ''} $input]
    22    uplevel [list do_execsql_test $tn "
    23      SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
    24    " [list [list {*}$res]]]
    25  }
    26  
    27  proc do_unicode_token_test2 {tn input res} {
    28    set input [string map {' ''} $input]
    29    uplevel [list do_execsql_test $tn "
    30      SELECT fts3_tokenizer_test('unicode61', '$input');
    31    " [list [list {*}$res]]]
    32  }
    33  
    34  proc do_unicode_token_test3 {tn args} {
    35    set res   [lindex $args end]
    36    set sql "SELECT fts3_tokenizer_test('unicode61'"
    37    foreach a [lrange $args 0 end-1] {
    38      append sql ", '"
    39      append sql [string map {' ''} $a]
    40      append sql "'"
    41    }
    42    append sql ")"
    43    uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
    44  }
    45  
        # Tests 1.0 - 1.6 and 1.11 run with remove_diacritics=0. Judging from
        # the values below, each expected list is a run of triples: position,
        # case-folded token, original text.
    46  do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
    47  
        # Upper-case letters with a diaeresis fold to their lower-case forms and
        # keep the diacritic.
    48  do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    49      "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"
    50  
    51  do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    52      "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"
    53  
    54  # 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
    55  do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
    56  do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"
    57  
    58  do_unicode_token_test 1.5 "The quick brown fox" {
    59    0 the The 1 quick quick 2 brown brown 3 fox fox
    60  }
        # U+00BF, U+224E and U+2263 are expected to split tokens exactly as the
        # spaces in 1.5 do.
    61  do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
    62    0 the The 1 quick quick 2 brown brown 3 fox fox
    63  }
    64  
        # Tests 1.7 - 1.10 pass no tokenizer options (do_unicode_token_test2);
        # the expected folded tokens have their diacritics stripped.
    65  do_unicode_token_test2 1.7  {a B c D} {0 a a 1 b B 2 c c 3 d D}
    66  do_unicode_token_test2 1.8  "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"
    67  
    68  do_unicode_token_test2 1.9  "x\uC4x x\uD6x x\uDCx" \
    69      "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"
    70  
    71  # Check that diacritics are removed if remove_diacritics=1 is specified.
    72  # And that they do not break tokens.
        # NOTE: do_unicode_token_test2 does not pass remove_diacritics=1, so this
        # exercises the behaviour when the option is left unspecified.
    73  do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"
    74  
    75  # Title-case mappings work
    76  do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"
    77  
    78  #-------------------------------------------------------------------------
    79  #
        # Sample documents for the 2.x tests. Each one goes through mapdoc
        # before it is inserted into the FTS table.
    80  set docs [list {
    81    Enhance the INSERT syntax to allow multiple rows to be inserted via the
    82    VALUES clause.
    83  } {
    84    Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
    85  } {
    86    Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
    87  } {
    88    Added the sqlite3_db_readonly() interface.
    89  } {
    90    Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
    91    ability to add new PRAGMA statements or to override built-in PRAGMAs.  
    92  } {
    93    Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
    94    the same row that contains the maximum x value.
    95  } {
    96    Added support for the FTS4 languageid option.
    97  } {
    98    Documented support for the FTS4 content option. This feature has actually
    99    been in the code since version 3.7.9 but is only now considered to be
   100    officially supported.  
   101  } {
   102    Pending statements no longer block ROLLBACK. Instead, the pending statement
   103    will return SQLITE_ABORT upon next access after the ROLLBACK.  
   104  } {
   105    Improvements to the handling of CSV inputs in the command-line shell
   106  } {
   107    Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
   108    incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
   109    connected by OR.  
   110  }]
   111  
        # map(<letter>) holds two forms of that letter carrying a diaeresis:
        # upper case first, lower case second.
   112  set map(a) [list "\u00C4" "\u00E4"]  ; # LATIN LETTER A WITH DIAERESIS
   113  set map(e) [list "\u00CB" "\u00EB"]  ; # LATIN LETTER E WITH DIAERESIS
   114  set map(i) [list "\u00CF" "\u00EF"]  ; # LATIN LETTER I WITH DIAERESIS
   115  set map(o) [list "\u00D6" "\u00F6"]  ; # LATIN LETTER O WITH DIAERESIS
   116  set map(u) [list "\u00DC" "\u00FC"]  ; # LATIN LETTER U WITH DIAERESIS
   117  set map(y) [list "\u0178" "\u00FF"]  ; # LATIN LETTER Y WITH DIAERESIS
   118  set map(h) [list "\u1E26" "\u1E27"]  ; # LATIN LETTER H WITH DIAERESIS
   119  set map(w) [list "\u1E84" "\u1E85"]  ; # LATIN LETTER W WITH DIAERESIS
   120  set map(x) [list "\u1E8C" "\u1E8D"]  ; # LATIN LETTER X WITH DIAERESIS
        # Flatten map() into a substitution list for "string map", kept in the
        # global variable "mappings": every listed plain letter, in both cases,
        # is paired with its diaeresis form.
   121  foreach k [array names map] {
   122    lappend mappings [string toupper $k] [lindex $map($k) 0] 
   123    lappend mappings $k [lindex $map($k) 1]
   124  }
   125  proc mapdoc {doc} { 
   126    set doc [regsub -all {[[:space:]]+} $doc " "]
   127    string map $::mappings [string trim $doc] 
   128  }
   129  
        # 2.0: create an FTS4 table that uses the unicode61 tokenizer and load it
        # with the mapped sample documents.
   130  do_test 2.0 {
   131    execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
   132    foreach doc $docs {
   133      set d [mapdoc $doc]
   134      execsql { INSERT INTO t2 VALUES($d) }
   135    }
   136  } {}
   137  
        # 2.1: the query term and the expected row are both passed through
        # mapdoc, so the search runs against the diaeresis-decorated text.
   138  do_test 2.1 {
   139    set q [mapdoc "row"]
   140    execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
   141  } [list [mapdoc {
   142    Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
   143    the same row that contains the maximum x value.
   144  }]]
   145  
        # 2.2 - 2.6: snippet() output for queries written in differing case and
        # for one prefix query. The expected snippet is normalised with mapdoc
        # before the comparison.
   146  foreach {tn query snippet} {
   147    2 "row" {
   148       ...returns the value of y on the same [row] that contains 
   149       the maximum x value.
   150    }
   151    3 "ROW" {
   152       ...returns the value of y on the same [row] that contains 
   153       the maximum x value.
   154    }
   155    4 "rollback" {
   156       ...[ROLLBACK]. Instead, the pending statement
   157       will return SQLITE_ABORT upon next access after the [ROLLBACK].
   158    }
   159    5 "rOllback" {
   160       ...[ROLLBACK]. Instead, the pending statement
   161       will return SQLITE_ABORT upon next access after the [ROLLBACK].
   162    }
   163    6 "lang*" {
   164       Added support for the FTS4 [languageid] option.
   165    }
   166  } {
   167    do_test 2.$tn {
   168      set q [mapdoc $query]
   169      execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
   170    } [list [mapdoc $snippet]]
   171  }
   172  
   173  #-------------------------------------------------------------------------
   174  # Make sure the unicode61 tokenizer does not crash if it is passed a 
   175  # NULL pointer.
   176  reset_db
        # 3.1: a row whose first indexed column is NULL.
   177  do_execsql_test 3.1 {
   178    CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
   179    INSERT INTO t1 VALUES(NULL, 'a b c');
   180  }
   181  
   182  do_execsql_test 3.2 {
   183    SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
   184  } {{a [b] c}}
   185  
        # 3.3: replace the contents with one row of 'b' tokens, double the table
        # sixteen times with INSERT ... SELECT, then add two rows whose y column
        # is NULL.
   186  do_execsql_test 3.3 {
   187    BEGIN;
   188    DELETE FROM t1;
   189    INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
   190    INSERT INTO t1 SELECT * FROM t1;
   191    INSERT INTO t1 SELECT * FROM t1;
   192    INSERT INTO t1 SELECT * FROM t1;
   193    INSERT INTO t1 SELECT * FROM t1;
   194    INSERT INTO t1 SELECT * FROM t1;
   195    INSERT INTO t1 SELECT * FROM t1;
   196    INSERT INTO t1 SELECT * FROM t1;
   197    INSERT INTO t1 SELECT * FROM t1;
   198    INSERT INTO t1 SELECT * FROM t1;
   199    INSERT INTO t1 SELECT * FROM t1;
   200    INSERT INTO t1 SELECT * FROM t1;
   201    INSERT INTO t1 SELECT * FROM t1;
   202    INSERT INTO t1 SELECT * FROM t1;
   203    INSERT INTO t1 SELECT * FROM t1;
   204    INSERT INTO t1 SELECT * FROM t1;
   205    INSERT INTO t1 SELECT * FROM t1;
   206    INSERT INTO t1 VALUES('a b c', NULL);
   207    INSERT INTO t1 VALUES('a x c', NULL);
   208    COMMIT;
   209  }
   210  
        # 3.4: only the 'a b c' row holds both query terms; its NULL y column
        # shows up as an empty value in the expected result.
   211  do_execsql_test 3.4 {
   212    SELECT * FROM t1 WHERE t1 MATCH 'a b';
   213  } {{a b c} {}}
   214  
   215  #-------------------------------------------------------------------------
   216  #
        # The 4.x tests insert unusual values and expect an empty result; the
        # index contents are not inspected afterwards.
   217  reset_db
   218  
        # 4.1: U+FFFE and U+D800, in the middle and at the start of the text.
   219  do_test 4.1 {
   220    set a "abc\uFFFEdef"
   221    set b "abc\uD800def"
   222    set c "\uFFFEdef"
   223    set d "\uD800def"
   224    execsql {
   225      CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
   226      INSERT INTO t1 VALUES($a);
   227      INSERT INTO t1 VALUES($b);
   228      INSERT INTO t1 VALUES($c);
   229      INSERT INTO t1 VALUES($d);
   230    }
   231  } {}
   232  
        # 4.2: byte strings in which 0xF7 is followed by three to six 0xBF bytes,
        # placed between 0x61 ('a') and 0x62 ('b').
   233  do_test 4.2 {
   234    set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
   235    set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
   236    set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
   237    set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
   238    execsql {
   239      INSERT INTO t1 VALUES($a);
   240      INSERT INTO t1 VALUES($b);
   241      INSERT INTO t1 VALUES($c);
   242      INSERT INTO t1 VALUES($d);
   243    }
   244  } {}
   245  
        # 4.3: the same byte sequences with nothing before or after them.
   246  do_test 4.3 {
   247    set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
   248    set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
   249    set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
   250    set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
   251    execsql {
   252      INSERT INTO t1 VALUES($a);
   253      INSERT INTO t1 VALUES($b);
   254      INSERT INTO t1 VALUES($c);
   255      INSERT INTO t1 VALUES($d);
   256    }
   257  } {}
   258  
   259  #-------------------------------------------------------------------------
   260  
        # 5.x: the tokenchars= and separators= tokenizer options. With
        # do_unicode_token_test3 the options come first, then the input text,
        # then the expected tokens.
        #
        # 5.1 / 5.2: "_" splits tokens unless it is listed in tokenchars.
   261  do_unicode_token_test3 5.1 {tokenchars=} {
   262    sqlite3_reset sqlite3_column_int
   263  } {
   264    0 sqlite3 sqlite3 
   265    1 reset reset 
   266    2 sqlite3 sqlite3 
   267    3 column column 
   268    4 int int
   269  }
   270  
   271  do_unicode_token_test3 5.2 {tokenchars=_} {
   272    sqlite3_reset sqlite3_column_int
   273  } {
   274    0 sqlite3_reset sqlite3_reset 
   275    1 sqlite3_column_int sqlite3_column_int
   276  }
   277  
        # 5.3 / 5.4: letters listed in separators split tokens; listing the same
        # letters in tokenchars leaves the word whole.
   278  do_unicode_token_test3 5.3 {separators=xyz} {
   279    Laotianxhorseyrunszfast
   280  } {
   281    0 laotian Laotian
   282    1 horse horse
   283    2 runs runs
   284    3 fast fast
   285  }
   286  
   287  do_unicode_token_test3 5.4 {tokenchars=xyz} {
   288    Laotianxhorseyrunszfast
   289  } {
   290    0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
   291  }
   292  
        # 5.5: both options given together.
   293  do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
   294    sqlite3_resetxsqlite3_column_intyhonda_phantom
   295  } {
   296    0 sqlite3_reset sqlite3_reset 
   297    1 sqlite3_column_int sqlite3_column_int
   298    2 honda_phantom honda_phantom
   299  }
   300  
        # 5.6 / 5.7: non-ASCII codepoints used in both options.
   301  do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
   302    0 abc abc 1 def def
   303  }
   304  
   305  do_unicode_token_test3 5.7                             \
   306    "tokenchars=\u2444\u2445"                            \
   307    "separators=\u05D0\u05D1\u05D2"                      \
   308    "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
   309    [list                                                \
   310      0 \u2444fre\u2445sh \u2444fre\u2445sh              \
   311      1 water water                                      \
   312      2 fish fish                                        \
   313      3 \u2445timer \u2445timer                          \
   314    ]
   315  
   316  # Check that it is not possible to add a standalone diacritic codepoint 
   317  # to either separators or tokenchars.
   318  do_unicode_token_test3 5.8 "separators=\u0301" \
   319    "hello\u0301world \u0301helloworld"          \
   320    "0 helloworld hello\u0301world 1 helloworld helloworld"
   321  
   322  do_unicode_token_test3 5.9 "tokenchars=\u0301" \
   323    "hello\u0301world \u0301helloworld"          \
   324    "0 helloworld hello\u0301world 1 helloworld helloworld"
   325  
        # 5.10 / 5.11: the same two checks with remove_diacritics=0; the
        # combining accent inside the first word is kept in the folded token.
   326  do_unicode_token_test3 5.10 "separators=\u0301" \
   327    "remove_diacritics=0"                        \
   328    "hello\u0301world \u0301helloworld"          \
   329    "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
   330  
   331  do_unicode_token_test3 5.11 "tokenchars=\u0301" \
   332    "remove_diacritics=0"                         \
   333    "hello\u0301world \u0301helloworld"           \
   334    "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
   335  
   336  
   337  #-------------------------------------------------------------------------
   338  
   339  proc do_tokenize {tokenizer txt} {
   340    set res [list]
   341    foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
   342      lappend res $b
   343    }
   344    set res
   345  }
   346  
   347  # Argument $lCp must be a list of codepoints (integers) that 
   348  # correspond to whitespace characters. This command creates a string
   349  # $W from the codepoints, then tokenizes "${W}hello${W}world${W}" 
   350  # using tokenizer $tokenizer. The test passes if the tokenizer successfully
   351  # extracts the two 5 character tokens.
   352  #
   353  proc do_isspace_test {tn tokenizer lCp} {
   354    set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp] 
   355    set txt "${whitespace}hello${whitespace}world${whitespace}"
   356    uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
   357  }
   358  
        # Tokenizers to exercise: always unicode61, plus icu when that
        # capability is present.
   359  set tokenizers [list unicode61]
   360  ifcapable icu { lappend tokenizers icu }
   361  
   362  # Some tests to check that the tokenizers can both identify white-space 
   363  # codepoints. All codepoints tested below are of type "Zs" in the
   364  # UnicodeData.txt file.
   365  #
   366  # Note that codepoint 6158 has changed from Zs to Cf in recent versions
   367  # of UnicodeData.txt.  So take that into account for the "icu" tests.
   368  #
        # NOTE(review): 8239 is skipped for icu as a single codepoint (6.$T.16)
        # but is still part of the combined list used by 6.$T.22 -- confirm that
        # this is intended.
   369  foreach T $tokenizers {
   370    do_isspace_test 6.$T.1 $T    32
   371    do_isspace_test 6.$T.2 $T    160
   372    do_isspace_test 6.$T.3 $T    5760
   373    if {$T!="icu"} {
   374      do_isspace_test 6.$T.4 $T    6158
   375    }
   376    do_isspace_test 6.$T.5 $T    8192
   377    do_isspace_test 6.$T.6 $T    8193
   378    do_isspace_test 6.$T.7 $T    8194
   379    do_isspace_test 6.$T.8 $T    8195
   380    do_isspace_test 6.$T.9 $T    8196
   381    do_isspace_test 6.$T.10 $T    8197
   382    do_isspace_test 6.$T.11 $T    8198
   383    do_isspace_test 6.$T.12 $T    8199
   384    do_isspace_test 6.$T.13 $T    8200
   385    do_isspace_test 6.$T.14 $T    8201
   386    do_isspace_test 6.$T.15 $T    8202
   387    if {$T!="icu"} {
   388      do_isspace_test 6.$T.16 $T    8239
   389    }
   390    do_isspace_test 6.$T.17 $T    8287
   391    do_isspace_test 6.$T.18 $T   12288
   392  
   393    if {$T!="icu"} {
   394      do_isspace_test 6.$T.19 $T   {32 160 5760 6158}
   395    } else {
   396      do_isspace_test 6.$T.19 $T   {32 160 5760 8192}
   397    }
   398    do_isspace_test 6.$T.20 $T   {8192 8193 8194 8195}
   399    do_isspace_test 6.$T.21 $T   {8196 8197 8198 8199}
   400    do_isspace_test 6.$T.22 $T   {8200 8201 8202 8239}
   401    do_isspace_test 6.$T.23 $T   {8287 12288}
   402  }
   403  
   404  #-------------------------------------------------------------------------
   405  # Test that the private use ranges are treated as alphanumeric.
   406  #
        # Every "*" in the option, input and expected strings of the inner list
        # is replaced by the codepoint under test. Case 1 passes no option and
        # expects a single token; case 2 declares the codepoint a separator and
        # expects two.
   407  foreach {tn1 c} {
   408    1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
   409  } {
   410    foreach {tn2 config res} {
   411      1 ""             "0 hello*world hello*world"
   412      2 "separators=*" "0 hello hello 1 world world"
   413    } {
   414      set config [string map [list * $c] $config]
   415      set input  [string map [list * $c] "hello*world"]
   416      set output [string map [list * $c] $res]
   417      do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
   418    }
   419  }
   420  
   421  #-------------------------------------------------------------------------
   422  # Cursory test of remove_diacritics=0.
   423  #
   424  # 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
   425  # 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
   426  # 00E4;LATIN SMALL LETTER A WITH DIAERESIS
   427  # 00F6;LATIN SMALL LETTER O WITH DIAERESIS
   428  #
        # 8.1.x: table t3 uses remove_diacritics=1. The SQL is written in double
        # quotes so that Tcl expands the \xNN escapes before the statement runs.
        # Plain letters and their diaeresis forms are all expected to match the
        # plain query term.
   429  do_execsql_test 8.1.1 "
   430    CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
   431    INSERT INTO t3 VALUES('o');
   432    INSERT INTO t3 VALUES('a');
   433    INSERT INTO t3 VALUES('O');
   434    INSERT INTO t3 VALUES('A');
   435    INSERT INTO t3 VALUES('\xD6');
   436    INSERT INTO t3 VALUES('\xC4');
   437    INSERT INTO t3 VALUES('\xF6');
   438    INSERT INTO t3 VALUES('\xE4');
   439  "
   440  do_execsql_test 8.1.2 {
   441    SELECT rowid FROM t3 WHERE t3 MATCH 'o';
   442  } {1 3 5 7}
   443  do_execsql_test 8.1.3 {
   444    SELECT rowid FROM t3 WHERE t3 MATCH 'a';
   445  } {2 4 6 8}
        # 8.2.x: the same rows copied into t4, which uses remove_diacritics=0.
        # Only the rows holding plain letters are expected to match.
   446  do_execsql_test 8.2.1 {
   447    CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
   448    INSERT INTO t4 SELECT * FROM t3;
   449  }
   450  do_execsql_test 8.2.2 {
   451    SELECT rowid FROM t4 WHERE t4 MATCH 'o';
   452  } {1 3}
   453  do_execsql_test 8.2.3 {
   454    SELECT rowid FROM t4 WHERE t4 MATCH 'a';
   455  } {2 4}
   456  
   457  #-------------------------------------------------------------------------
   458  #
        # Each SQL script in the list below declares t5, t6 and t7 with tokenizer
        # options written in a different quoting style: square brackets, double
        # quotes, single quotes and backticks. All four variants are checked
        # against the same expected fts4aux output.
   459  foreach {tn sql} {
   460    1 {
   461      CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
   462      CREATE VIRTUAL TABLE t6 USING fts4(
   463          tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
   464      CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
   465    }
   466    2 {
   467      CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
   468      CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
   469      CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
   470    }
   471    3 {
   472      CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
   473      CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
   474      CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
   475    }
   476    4 {
   477      CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
   478      CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
   479      CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
   480    }
   481  } {
   482    do_execsql_test 9.$tn.0 { 
   483      DROP TABLE IF EXISTS t5;
   484      DROP TABLE IF EXISTS t5aux;
   485      DROP TABLE IF EXISTS t6;
   486      DROP TABLE IF EXISTS t6aux;
   487      DROP TABLE IF EXISTS t7;
   488      DROP TABLE IF EXISTS t7aux;
   489    }
   490    do_execsql_test 9.$tn.1 $sql
   491  
   492    do_execsql_test 9.$tn.2 {
   493      CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
   494      INSERT INTO t5 VALUES('one two three/four.five.six');
   495      SELECT * FROM t5aux;
   496    } {
   497      four.five.six   * 1 1 four.five.six   0 1 1 
   498      {one two three} * 1 1 {one two three} 0 1 1
   499    }
   500  
   501    do_execsql_test 9.$tn.3 {
   502      CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
   503      INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
   504      SELECT * FROM t6aux;
   505    } {
   506      {alpha=beta"gamma}   * 1 1 {alpha=beta"gamma} 0 1 1 
   507      {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
   508    }
   509  
   510    do_execsql_test 9.$tn.4 {
   511      CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
   512      INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
   513      SELECT * FROM t7aux;
   514    } {
   515      aleph * 1 1 aleph 0 1 1 
   516      beth  * 1 1 beth  0 1 1 
   517      gimel * 1 1 gimel 0 1 1
   518    }
   519  }
   520  
   521  # Check that multiple options are handled correctly.
   522  #
        # Expected outcome for the inputs below: "a", "x" and "y" split tokens,
        # while "." and "=" are kept inside a token.
   523  do_execsql_test 10.1 {
   524    DROP TABLE IF EXISTS t1;
   525    CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
   526      "tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
   527      "separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
   528    );
   529  
   530    INSERT INTO t1 VALUES('oneatwoxthreeyfour');
   531    INSERT INTO t1 VALUES('a.single=word');
   532    CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
   533    SELECT * FROM t1aux;
   534  } {
   535    .single=word * 1 1 .single=word 0 1 1 
   536    four         * 1 1 four         0 1 1 
   537    one          * 1 1 one          0 1 1 
   538    three        * 1 1 three        0 1 1 
   539    two          * 1 1 two          0 1 1
   540  }
   541  
   542  # Test that case folding happens after tokenization, not before.
   543  #
        # Separators are "a" and "B" exactly: the first row is split at both,
        # while the second row, which has "b" and "A" instead, stays one token.
   544  do_execsql_test 10.2 {
   545    DROP TABLE IF EXISTS t2;
   546    CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
   547    INSERT INTO t2 VALUES('oneatwoBthree');
   548    INSERT INTO t2 VALUES('onebtwoAthree');
   549    CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
   550    SELECT * FROM t2aux;
   551  } {
   552    one           * 1 1 one           0 1 1 
   553    onebtwoathree * 1 1 onebtwoathree 0 1 1 
   554    three         * 1 1 three         0 1 1 
   555    two           * 1 1 two           0 1 1
   556  }
   557  
   558  # Test that the tokenchars and separators options work with the 
   559  # fts3tokenize table.
   560  #
        # "@" and "." are declared token characters and the ten digits
        # separators, so the input is expected to yield two tokens.
   561  do_execsql_test 11.1 {
   562    CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
   563      "unicode61", "tokenchars=@.", "separators=1234567890"
   564    );
   565    SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
   566  } {
   567    berlin@street sydney.road
   568  }
   569  
   570  # Test for embedded nul characters in fts4 unicode index.
   571  #
        # 12.0: the stored content keeps the 0x00 byte and the text after it.
   572  do_execsql_test 12.0 {
   573    CREATE VIRTUAL TABLE t12 USING fts4(tokenize=unicode61);
   574    INSERT INTO t12 VALUES('abc' || char(0) || 'def');
   575    SELECT hex(CAST(content AS blob)) FROM t12;
   576  } {61626300646566}
   577  do_execsql_test 12.1 {
   578    INSERT INTO t12(t12) VALUES('integrity-check');
   579  } {}
        # 12.2: only the term "abc" is expected in the index.
   580  do_execsql_test 12.2 { 
   581    CREATE VIRTUAL TABLE t12aux USING fts4aux(t12);
   582    SELECT * FROM t12aux;
   583  } {abc * 1 1 abc 0 1 1}
        # 12.3: a match on "abc" still returns the full stored content.
   584  do_execsql_test 12.3 { 
   585    SELECT hex(CAST(content AS blob)) FROM t12 WHERE t12 MATCH 'abc'
   586  } {61626300646566}
   587  
        # Tell the test harness that this file is done.
   588  finish_test