github.com/jdgcs/sqlite3@v1.12.1-0.20210908114423-bc5f96e4dd51/testdata/tcl/fts3_common.tcl (about) 1 # 2009 November 04 2 # 3 # The author disclaims copyright to this source code. In place of 4 # a legal notice, here is a blessing: 5 # 6 # May you do good and not evil. 7 # May you find forgiveness for yourself and forgive others. 8 # May you share freely, never taking more than you give. 9 # 10 #*********************************************************************** 11 # 12 # This file contains common code used the fts3 tests. At one point 13 # equivalent functionality was implemented in C code. But it is easier 14 # to use Tcl. 15 # 16 17 #------------------------------------------------------------------------- 18 # INSTRUCTIONS 19 # 20 # The following commands are available: 21 # 22 # fts3_build_db_1 N 23 # Using database handle [db] create an FTS4 table named t1 and populate 24 # it with N rows of data. N must be less than 10,000. Refer to the 25 # header comments above the proc implementation below for details. 26 # 27 # fts3_build_db_2 N 28 # Using database handle [db] create an FTS4 table named t2 and populate 29 # it with N rows of data. N must be less than 100,000. Refer to the 30 # header comments above the proc implementation below for details. 31 # 32 # fts3_integrity_check TBL 33 # TBL must be an FTS table in the database currently opened by handle 34 # [db]. This proc loads and tokenizes all documents within the table, 35 # then checks that the current contents of the FTS index matches the 36 # results. 37 # 38 # fts3_terms TBL WHERE 39 # Todo. 40 # 41 # fts3_doclist TBL TERM WHERE 42 # Todo. 43 # 44 # 45 # 46 47 ifcapable fts3 { 48 sqlite3_fts3_may_be_corrupt 0 49 } 50 51 #------------------------------------------------------------------------- 52 # USAGE: fts3_build_db_1 SWITCHES N 53 # 54 # Build a sample FTS table in the database opened by database connection 55 # [db]. The name of the new table is "t1". 56 # 57 proc fts3_build_db_1 {args} { 58 59 set default(-module) fts4 60 61 set nArg [llength $args] 62 if {($nArg%2)==0} { 63 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" 64 } 65 66 set n [lindex $args [expr $nArg-1]] 67 array set opts [array get default] 68 array set opts [lrange $args 0 [expr $nArg-2]] 69 foreach k [array names opts] { 70 if {0==[info exists default($k)]} { error "unknown option: $k" } 71 } 72 73 if {$n > 10000} {error "n must be <= 10000"} 74 db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)" 75 76 set xwords [list zero one two three four five six seven eight nine ten] 77 set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa] 78 79 for {set i 0} {$i < $n} {incr i} { 80 set x "" 81 set y "" 82 83 set x [list] 84 lappend x [lindex $xwords [expr ($i / 1000) % 10]] 85 lappend x [lindex $xwords [expr ($i / 100) % 10]] 86 lappend x [lindex $xwords [expr ($i / 10) % 10]] 87 lappend x [lindex $xwords [expr ($i / 1) % 10]] 88 89 set y [list] 90 lappend y [lindex $ywords [expr ($i / 1000) % 10]] 91 lappend y [lindex $ywords [expr ($i / 100) % 10]] 92 lappend y [lindex $ywords [expr ($i / 10) % 10]] 93 lappend y [lindex $ywords [expr ($i / 1) % 10]] 94 95 db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) } 96 } 97 } 98 99 #------------------------------------------------------------------------- 100 # USAGE: fts3_build_db_2 N ARGS 101 # 102 # Build a sample FTS table in the database opened by database connection 103 # [db]. The name of the new table is "t2". 104 # 105 proc fts3_build_db_2 {args} { 106 107 set default(-module) fts4 108 set default(-extra) "" 109 110 set nArg [llength $args] 111 if {($nArg%2)==0} { 112 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" 113 } 114 115 set n [lindex $args [expr $nArg-1]] 116 array set opts [array get default] 117 array set opts [lrange $args 0 [expr $nArg-2]] 118 foreach k [array names opts] { 119 if {0==[info exists default($k)]} { error "unknown option: $k" } 120 } 121 122 if {$n > 100000} {error "n must be <= 100000"} 123 124 set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content" 125 if {$opts(-extra) != ""} { 126 append sql ", " $opts(-extra) 127 } 128 append sql ")" 129 db eval $sql 130 131 set chars [list a b c d e f g h i j k l m n o p q r s t u v w x y z ""] 132 133 for {set i 0} {$i < $n} {incr i} { 134 set word "" 135 set nChar [llength $chars] 136 append word [lindex $chars [expr {($i / 1) % $nChar}]] 137 append word [lindex $chars [expr {($i / $nChar) % $nChar}]] 138 append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]] 139 140 db eval { INSERT INTO t2(docid, content) VALUES($i, $word) } 141 } 142 } 143 144 #------------------------------------------------------------------------- 145 # USAGE: fts3_integrity_check TBL 146 # 147 # This proc is used to verify that the full-text index is consistent with 148 # the contents of the fts3 table. In other words, it checks that the 149 # data in the %_contents table matches that in the %_segdir and %_segments 150 # tables. 151 # 152 # This is not an efficient procedure. It uses a lot of memory and a lot 153 # of CPU. But it is better than not checking at all. 154 # 155 # The procedure is: 156 # 157 # 1) Read the entire full-text index from the %_segdir and %_segments 158 # tables into memory. For each entry in the index, the following is 159 # done: 160 # 161 # set C($iDocid,$iCol,$iPosition) $zTerm 162 # 163 # 2) Iterate through each column of each row of the %_content table. 164 # Tokenize all documents, and check that for each token there is 165 # a corresponding entry in the $C array. After checking a token, 166 # [unset] the $C array entry. 167 # 168 # 3) Check that array $C is now empty. 169 # 170 # 171 proc fts3_integrity_check {tbl} { 172 173 fts3_read2 $tbl 1 A 174 175 foreach zTerm [array names A] { 176 #puts $zTerm 177 foreach doclist $A($zTerm) { 178 set docid 0 179 while {[string length $doclist]>0} { 180 set iCol 0 181 set iPos 0 182 set lPos [list] 183 set lCol [list] 184 185 # First varint of a doclist-entry is the docid. Delta-compressed 186 # with respect to the docid of the previous entry. 187 # 188 incr docid [gobble_varint doclist] 189 if {[info exists D($zTerm,$docid)]} { 190 while {[set iDelta [gobble_varint doclist]] != 0} {} 191 continue 192 } 193 set D($zTerm,$docid) 1 194 195 # Gobble varints until the 0x00 that terminates the doclist-entry 196 # is found. 197 while {[set iDelta [gobble_varint doclist]] > 0} { 198 if {$iDelta == 1} { 199 set iCol [gobble_varint doclist] 200 set iPos 0 201 } else { 202 incr iPos $iDelta 203 incr iPos -2 204 set C($docid,$iCol,$iPos) $zTerm 205 } 206 } 207 } 208 } 209 } 210 211 foreach key [array names C] { 212 #puts "$key -> $C($key)" 213 } 214 215 216 db eval "SELECT * FROM ${tbl}_content" E { 217 set iCol 0 218 set iDoc $E(docid) 219 foreach col [lrange $E(*) 1 end] { 220 set c $E($col) 221 set sql {SELECT fts3_tokenizer_test('simple', $c)} 222 223 foreach {pos term dummy} [db one $sql] { 224 if {![info exists C($iDoc,$iCol,$pos)]} { 225 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing" 226 lappend errors $es 227 } else { 228 if {[string compare $C($iDoc,$iCol,$pos) $term]} { 229 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index " 230 append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\"" 231 lappend errors $es 232 } 233 unset C($iDoc,$iCol,$pos) 234 } 235 } 236 incr iCol 237 } 238 } 239 240 foreach c [array names C] { 241 lappend errors "Bad index entry: $c -> $C($c)" 242 } 243 244 if {[info exists errors]} { return [join $errors "\n"] } 245 return "ok" 246 } 247 248 # USAGE: fts3_terms TBL WHERE 249 # 250 # Argument TBL must be the name of an FTS3 table. Argument WHERE is an 251 # SQL expression that will be used as the WHERE clause when scanning 252 # the %_segdir table. As in the following query: 253 # 254 # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}" 255 # 256 # This function returns a list of all terms present in the segments 257 # selected by the statement above. 258 # 259 proc fts3_terms {tbl where} { 260 fts3_read $tbl $where a 261 return [lsort [array names a]] 262 } 263 264 265 # USAGE: fts3_doclist TBL TERM WHERE 266 # 267 # Argument TBL must be the name of an FTS3 table. TERM is a term that may 268 # or may not be present in the table. Argument WHERE is used to select a 269 # subset of the b-tree segments in the associated full-text index as 270 # described above for [fts3_terms]. 271 # 272 # This function returns the results of merging the doclists associated 273 # with TERM in the selected segments. Each doclist is an element of the 274 # returned list. Each doclist is formatted as follows: 275 # 276 # [$docid ?$col[$off1 $off2...]?...] 277 # 278 # The formatting is odd for a Tcl command in order to be compatible with 279 # the original C-language implementation. If argument WHERE is "1", then 280 # any empty doclists are omitted from the returned list. 281 # 282 proc fts3_doclist {tbl term where} { 283 fts3_read $tbl $where a 284 285 286 foreach doclist $a($term) { 287 set docid 0 288 289 while {[string length $doclist]>0} { 290 set iCol 0 291 set iPos 0 292 set lPos [list] 293 set lCol [list] 294 incr docid [gobble_varint doclist] 295 296 while {[set iDelta [gobble_varint doclist]] > 0} { 297 if {$iDelta == 1} { 298 lappend lCol [list $iCol $lPos] 299 set iPos 0 300 set lPos [list] 301 set iCol [gobble_varint doclist] 302 } else { 303 incr iPos $iDelta 304 incr iPos -2 305 lappend lPos $iPos 306 } 307 } 308 309 if {[llength $lPos]>0} { 310 lappend lCol [list $iCol $lPos] 311 } 312 313 if {$where != "1" || [llength $lCol]>0} { 314 set ret($docid) $lCol 315 } else { 316 unset -nocomplain ret($docid) 317 } 318 } 319 } 320 321 set lDoc [list] 322 foreach docid [lsort -integer [array names ret]] { 323 set lCol [list] 324 set cols "" 325 foreach col $ret($docid) { 326 foreach {iCol lPos} $col {} 327 append cols " $iCol\[[join $lPos { }]\]" 328 } 329 lappend lDoc "\[${docid}${cols}\]" 330 } 331 332 join $lDoc " " 333 } 334 335 ########################################################################### 336 337 proc gobble_varint {varname} { 338 upvar $varname blob 339 set n [read_fts3varint $blob ret] 340 set blob [string range $blob $n end] 341 return $ret 342 } 343 proc gobble_string {varname nLength} { 344 upvar $varname blob 345 set ret [string range $blob 0 [expr $nLength-1]] 346 set blob [string range $blob $nLength end] 347 return $ret 348 } 349 350 # The argument is a blob of data representing an FTS3 segment leaf. 351 # Return a list consisting of alternating terms (strings) and doclists 352 # (blobs of data). 353 # 354 proc fts3_readleaf {blob} { 355 set zPrev "" 356 set terms [list] 357 358 while {[string length $blob] > 0} { 359 set nPrefix [gobble_varint blob] 360 set nSuffix [gobble_varint blob] 361 362 set zTerm [string range $zPrev 0 [expr $nPrefix-1]] 363 append zTerm [gobble_string blob $nSuffix] 364 set nDoclist [gobble_varint blob] 365 set doclist [gobble_string blob $nDoclist] 366 367 lappend terms $zTerm $doclist 368 set zPrev $zTerm 369 } 370 371 return $terms 372 } 373 374 proc fts3_read2 {tbl where varname} { 375 upvar $varname a 376 array unset a 377 db eval " SELECT start_block, leaves_end_block, root 378 FROM ${tbl}_segdir WHERE $where 379 ORDER BY level ASC, idx DESC 380 " { 381 set c 0 382 binary scan $root c c 383 if {$c==0} { 384 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } 385 } else { 386 db eval " SELECT block 387 FROM ${tbl}_segments 388 WHERE blockid>=$start_block AND blockid<=$leaves_end_block 389 ORDER BY blockid 390 " { 391 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } 392 } 393 } 394 } 395 } 396 397 proc fts3_read {tbl where varname} { 398 upvar $varname a 399 array unset a 400 db eval " SELECT start_block, leaves_end_block, root 401 FROM ${tbl}_segdir WHERE $where 402 ORDER BY level DESC, idx ASC 403 " { 404 if {$start_block == 0} { 405 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } 406 } else { 407 db eval " SELECT block 408 FROM ${tbl}_segments 409 WHERE blockid>=$start_block AND blockid<$leaves_end_block 410 ORDER BY blockid 411 " { 412 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } 413 414 } 415 } 416 } 417 } 418 419 ########################################################################## 420