# modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts5/test/fts5tokenizer.test
#
# 2014 Dec 20
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the built-in fts5 tokenizers.
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5tokenizer

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}

#-------------------------------------------------------------------------
# 1.*: Accepted and rejected spellings of the tokenize= option: bare and
#      quoted tokenizer names, with and without spaces around the '=',
#      followed by tokenizer names that must be refused.
#
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  DROP TABLE ft1;
}
do_execsql_test 1.1 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter');
  DROP TABLE ft1;
}
do_execsql_test 1.2 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter);
  DROP TABLE ft1;
}
do_execsql_test 1.3 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
  DROP TABLE ft1;
}
do_execsql_test 1.4 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
  DROP TABLE ft1;
}

do_catchsql_test 1.5 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch');
} {1 {no such tokenizer: nosuch}}

do_catchsql_test 1.6 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch');
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# 2.*: With the porter tokenizer, queries for 'embedding' and 'database'
#      are expected to match the row containing 'embedded databases'.
#
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  INSERT INTO ft1 VALUES('embedded databases');
}
do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
do_execsql_test 2.3 {
  SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding'
} 1

#-------------------------------------------------------------------------
# 3.*: Tokenizer constructor written in Tcl. It stores the arguments it
#      was called with in ::targs and then always raises an error, so each
#      CREATE VIRTUAL TABLE below fails while leaving the parsed argument
#      list behind for inspection.
#
proc tcl_create {args} {
  set ::targs $args
  error "failed"
}
sqlite3_fts5_create_tokenizer db tcl tcl_create

foreach {tn directive expected} {
  1 {tokenize='tcl a b c'}                      {a b c}
  2 {tokenize='tcl ''d'' ''e'' ''f'''}          {d e f}
  3 {tokenize="tcl 'g' 'h' 'i'"}                {g h i}
  4 {tokenize = tcl}                            {}
} {
  do_catchsql_test 3.$tn.1 "
    CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
  " {1 {error in tokenizer constructor}}
  # ::targs was written by tcl_create during the failed statement above.
  do_test 3.$tn.2 { set ::targs } $expected
}

#-------------------------------------------------------------------------
# 4.*: Malformed table definitions must be reported as errors.
#
do_catchsql_test 4.1 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
} {1 {parse error in "tokenize = tcl abc"}}
do_catchsql_test 4.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x y)
} {1 {unrecognized column option: y}}

#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
foreach {tn tokenizer} {1 ascii 2 unicode61} {
  reset_db
  set T "$tokenizer tokenchars ',.:' separators 'xyz'"
  execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
  do_execsql_test 5.$tn.1 {
    INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  }
  # Each triple is: sub-test number, token to search for, expected rowids.
  foreach {tn2 token res} {
    1 abc 1      2 def 1       3 ghi 1   4 jkl {}
    5 mno {}     6 pqr {}      7 stu {}  8 jkl.mno,pqr:stu 1
    9 vw  1
  } {
    do_execsql_test 5.$tn.2.$tn2 "
      SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"'
    " $res
  }
}

#-------------------------------------------------------------------------
# Miscellaneous tests for the ascii tokenizer.
#
# 5.1.*: Test that the ascii tokenizer ignores non-ASCII characters in the
#        'separators' option. But unicode61 does not.
#
# 5.2.*: An option without an argument is an error.
#

do_test 5.1.1 {
  execsql "
    CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
    INSERT INTO a1 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' }
} {}

do_test 5.1.2 {
  execsql "
    CREATE VIRTUAL TABLE a2 USING fts5(
        x, tokenize=`unicode61 separators '\u1234'`);
    INSERT INTO a2 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' }
} {1}

do_catchsql_test 5.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 5.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE
# correctly.
#

# Token callback: appends each token to the list variable named by
# varname (resolved one call frame up) and returns "SQLITE_DONE" once
# that list holds three tokens, "SQLITE_OK" otherwise.
proc test_token_cb {varname token iStart iEnd} {
  upvar $varname var
  lappend var $token
  if {[llength $var]==3} { return "SQLITE_DONE" }
  return "SQLITE_OK"
}

# Auxiliary function body: tokenizes the text of column 0 through the
# supplied API command and returns the tokens gathered by test_token_cb.
# NOTE(review): relies on the callback being invoked with this proc as
# its immediate caller so that "res" resolves here - confirm against the
# test harness.
proc tokenize {cmd} {
  set res [list]
  $cmd xTokenize [$cmd xColumnText 0] [list test_token_cb res]
  set res
}
sqlite3_fts5_create_function db tokenize tokenize

do_execsql_test 6.0 {
  CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii);
  INSERT INTO x1 VALUES('q w e r t y');
  INSERT INTO x1 VALUES('y t r e w q');
  SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r';
} {
  {q w e} {y t r}
}

do_execsql_test 6.1 {
  CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61);
  INSERT INTO x2 VALUES('q w e r t y');
  INSERT INTO x2 VALUES('y t r e w q');
  SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r';
} {
  {q w e} {y t r}
}


#-------------------------------------------------------------------------
# Miscellaneous tests for the unicode tokenizer.
#
# These are numbered from 6.2 so that they do not reuse the id 6.1, which
# already names the unicode61 SQLITE_DONE test above.
#
do_catchsql_test 6.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.4 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 2'
  );
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.5 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 10'
  );
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# Porter tokenizer with very large tokens.
#
# Three single-letter tokens of 100, 500 and 1000 characters; every row
# holds two of them, so each token is expected to match exactly two rows.
#
set a [string repeat a 100]
set b [string repeat b 500]
set c [string repeat c 1000]
do_execsql_test 7.0 {
  CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
  INSERT INTO e5 VALUES($a || ' ' || $b);
  INSERT INTO e5 VALUES($b || ' ' || $c);
  INSERT INTO e5 VALUES($c || ' ' || $a);
}

do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }

#-------------------------------------------------------------------------
# Test the 'separators' option with the unicode61 tokenizer.
#
# Each test runs inside BEGIN ... ROLLBACK so the tables e6 and e7 can be
# created again by the next test. The fts5vocab table e7 lists the terms
# that ended up in the index of e6.
#
do_execsql_test 8.1 {
  BEGIN;
    CREATE VIRTUAL TABLE e6 USING fts5(x,
      tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    );
    INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
    SELECT term FROM e7;
  ROLLBACK;
} {
  brown dog fox jumped lazy over quick the
}

# Same idea with non-ASCII separators. [subst] is applied to both the SQL
# and the expected result so that the \uXXXX escapes become real
# characters before the test runs.
do_execsql_test 8.2 [subst {
  BEGIN;
    CREATE VIRTUAL TABLE e6 USING fts5(x,
      tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
    );
    INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01'
      || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
    );
    INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
    SELECT term FROM e7;
  ROLLBACK;
}] [subst {
  brown dog fox jumped lazy over quick the \u0E08 \u0E09
}]

# Test that the porter tokenizer correctly passes arguments through to
# its parent tokenizer.
do_execsql_test 8.3 {
  BEGIN;
    CREATE VIRTUAL TABLE e6 USING fts5(x,
      tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    );
    INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
    SELECT term FROM e7;
  ROLLBACK;
} {
  brown dog fox jump lazi over quick the
}

#-------------------------------------------------------------------------
# Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer
# implementation.
#
reset_db

# Tokenizer constructor: always succeeds and names tcl_tokenize as the
# command that performs the tokenization.
proc tcl_create {args} { return "tcl_tokenize" }
sqlite3_fts5_create_tokenizer db tcl tcl_create

# ::flags collects the flags argument of every tcl_tokenize call; it is
# cleared before each query below and inspected afterwards.
set ::flags [list]

# Tokenizer implementation: records the flags it was called with, then
# emits every token reported by fts5_tokenize_split for the text.
# NOTE(review): fts5_tokenize_split is not defined in this file;
# presumably it comes from fts5_common.tcl - confirm.
proc tcl_tokenize {tflags text} {
  lappend ::flags $tflags
  foreach {w iStart iEnd} [fts5_tokenize_split $text] {
    sqlite3_fts5_token $w $iStart $iEnd
  }
}

# Inserting two rows is expected to call the tokenizer twice in
# "document" mode.
do_execsql_test 9.1.1 {
  CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl);
  INSERT INTO t1 VALUES('abc');
  INSERT INTO t1 VALUES('xyz');
} {}
do_test 9.1.2 { set ::flags } {document document}

# A plain query term is expected to be tokenized in "query" mode.
set ::flags [list]
do_execsql_test 9.2.1 { SELECT * FROM t1('abc'); } {abc}
do_test 9.2.2 { set ::flags } {query}

# A term followed by '*' is expected to be tokenized in "prefixquery" mode.
set ::flags [list]
do_execsql_test 9.3.1 { SELECT * FROM t1('ab*'); } {abc}
do_test 9.3.2 { set ::flags } {prefixquery}

# A '*' placed after a quoted phrase still yields "prefixquery" ...
set ::flags [list]
do_execsql_test 9.4.1 { SELECT * FROM t1('"abc xyz" *'); } {}
do_test 9.4.2 { set ::flags } {prefixquery}

# ... whereas a '*' inside the quotes yields a plain "query".
set ::flags [list]
do_execsql_test 9.5.1 { SELECT * FROM t1('"abc xyz*"'); } {}
do_test 9.5.2 { set ::flags } {query}


finish_test