github.com/jdgcs/sqlite3@v1.12.1-0.20210908114423-bc5f96e4dd51/testdata/tcl/fts3_common.tcl (about)

     1  # 2009 November 04
     2  #
     3  # The author disclaims copyright to this source code.  In place of
     4  # a legal notice, here is a blessing:
     5  #
     6  #    May you do good and not evil.
     7  #    May you find forgiveness for yourself and forgive others.
     8  #    May you share freely, never taking more than you give.
     9  #
    10  #***********************************************************************
    11  #
    12  # This file contains common code used the fts3 tests. At one point
    13  # equivalent functionality was implemented in C code. But it is easier
    14  # to use Tcl.
    15  #
    16  
    17  #-------------------------------------------------------------------------
    18  # INSTRUCTIONS
    19  #
    20  # The following commands are available:
    21  #
    22  #   fts3_build_db_1 N
    23  #     Using database handle [db] create an FTS4 table named t1 and populate
    24  #     it with N rows of data. N must be less than 10,000. Refer to the
    25  #     header comments above the proc implementation below for details.
    26  #
    27  #   fts3_build_db_2 N
    28  #     Using database handle [db] create an FTS4 table named t2 and populate
    29  #     it with N rows of data. N must be less than 100,000. Refer to the
    30  #     header comments above the proc implementation below for details.
    31  #
    32  #   fts3_integrity_check TBL
    33  #     TBL must be an FTS table in the database currently opened by handle
    34  #     [db]. This proc loads and tokenizes all documents within the table,
    35  #     then checks that the current contents of the FTS index matches the
    36  #     results.
    37  #
    38  #   fts3_terms TBL WHERE
    39  #     Todo.
    40  #
    41  #   fts3_doclist TBL TERM WHERE
    42  #     Todo.
    43  #
    44  #
    45  #
    46  
    47  ifcapable fts3 {
    48    sqlite3_fts3_may_be_corrupt 0 
    49  }
    50  
    51  #-------------------------------------------------------------------------
    52  # USAGE: fts3_build_db_1 SWITCHES N
    53  #
    54  # Build a sample FTS table in the database opened by database connection 
    55  # [db]. The name of the new table is "t1".
    56  #
    57  proc fts3_build_db_1 {args} {
    58  
    59    set default(-module) fts4
    60  
    61    set nArg [llength $args]
    62    if {($nArg%2)==0} {
    63      error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
    64    }
    65  
    66    set n [lindex $args [expr $nArg-1]]
    67    array set opts [array get default]
    68    array set opts [lrange $args 0 [expr $nArg-2]]
    69    foreach k [array names opts] {
    70      if {0==[info exists default($k)]} { error "unknown option: $k" }
    71    }
    72  
    73    if {$n > 10000} {error "n must be <= 10000"}
    74    db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)"
    75  
    76    set xwords [list zero one two three four five six seven eight nine ten]
    77    set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
    78  
    79    for {set i 0} {$i < $n} {incr i} {
    80      set x ""
    81      set y ""
    82  
    83      set x [list]
    84      lappend x [lindex $xwords [expr ($i / 1000) % 10]]
    85      lappend x [lindex $xwords [expr ($i / 100)  % 10]]
    86      lappend x [lindex $xwords [expr ($i / 10)   % 10]]
    87      lappend x [lindex $xwords [expr ($i / 1)   % 10]]
    88  
    89      set y [list]
    90      lappend y [lindex $ywords [expr ($i / 1000) % 10]]
    91      lappend y [lindex $ywords [expr ($i / 100)  % 10]]
    92      lappend y [lindex $ywords [expr ($i / 10)   % 10]]
    93      lappend y [lindex $ywords [expr ($i / 1)   % 10]]
    94  
    95      db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) }
    96    }
    97  }
    98  
    99  #-------------------------------------------------------------------------
   100  # USAGE: fts3_build_db_2 N ARGS
   101  #
   102  # Build a sample FTS table in the database opened by database connection 
   103  # [db]. The name of the new table is "t2".
   104  #
   105  proc fts3_build_db_2 {args} {
   106  
   107    set default(-module) fts4
   108    set default(-extra)   ""
   109  
   110    set nArg [llength $args]
   111    if {($nArg%2)==0} {
   112      error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
   113    }
   114  
   115    set n [lindex $args [expr $nArg-1]]
   116    array set opts [array get default]
   117    array set opts [lrange $args 0 [expr $nArg-2]]
   118    foreach k [array names opts] {
   119      if {0==[info exists default($k)]} { error "unknown option: $k" }
   120    }
   121  
   122    if {$n > 100000} {error "n must be <= 100000"}
   123  
   124    set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content"
   125    if {$opts(-extra) != ""} {
   126      append sql ", " $opts(-extra)
   127    }
   128    append sql ")"
   129    db eval $sql
   130  
   131    set chars [list a b c d e f g h  i j k l m n o p  q r s t u v w x  y z ""]
   132  
   133    for {set i 0} {$i < $n} {incr i} {
   134      set word ""
   135      set nChar [llength $chars]
   136      append word [lindex $chars [expr {($i / 1)   % $nChar}]]
   137      append word [lindex $chars [expr {($i / $nChar)  % $nChar}]]
   138      append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]]
   139  
   140      db eval { INSERT INTO t2(docid, content) VALUES($i, $word) }
   141    }
   142  }
   143  
   144  #-------------------------------------------------------------------------
   145  # USAGE: fts3_integrity_check TBL
   146  #
   147  # This proc is used to verify that the full-text index is consistent with
   148  # the contents of the fts3 table. In other words, it checks that the
   149  # data in the %_contents table matches that in the %_segdir and %_segments 
   150  # tables.
   151  #
   152  # This is not an efficient procedure. It uses a lot of memory and a lot
   153  # of CPU. But it is better than not checking at all.
   154  #
   155  # The procedure is:
   156  #
   157  #   1) Read the entire full-text index from the %_segdir and %_segments
   158  #      tables into memory. For each entry in the index, the following is
   159  #      done:
   160  #
   161  #          set C($iDocid,$iCol,$iPosition) $zTerm
   162  #
   163  #   2) Iterate through each column of each row of the %_content table. 
   164  #      Tokenize all documents, and check that for each token there is
   165  #      a corresponding entry in the $C array. After checking a token,
   166  #      [unset] the $C array entry.
   167  #
   168  #   3) Check that array $C is now empty.
   169  #      
   170  #
   171  proc fts3_integrity_check {tbl} {
   172  
   173    fts3_read2 $tbl 1 A
   174  
   175    foreach zTerm [array names A] {
   176      #puts $zTerm
   177      foreach doclist $A($zTerm) {
   178        set docid 0
   179        while {[string length $doclist]>0} {
   180          set iCol 0
   181          set iPos 0
   182          set lPos [list]
   183          set lCol [list]
   184  
   185          # First varint of a doclist-entry is the docid. Delta-compressed
   186          # with respect to the docid of the previous entry.
   187          #
   188          incr docid [gobble_varint doclist]
   189          if {[info exists D($zTerm,$docid)]} {
   190            while {[set iDelta [gobble_varint doclist]] != 0} {}
   191            continue
   192          }
   193          set D($zTerm,$docid) 1
   194  
   195          # Gobble varints until the 0x00 that terminates the doclist-entry
   196          # is found.
   197          while {[set iDelta [gobble_varint doclist]] > 0} {
   198            if {$iDelta == 1} {
   199              set iCol [gobble_varint doclist]
   200              set iPos 0
   201            } else {
   202              incr iPos $iDelta
   203              incr iPos -2
   204              set C($docid,$iCol,$iPos) $zTerm
   205            }
   206          }
   207        }
   208      }
   209    }
   210  
   211    foreach key [array names C] {
   212      #puts "$key -> $C($key)"
   213    }
   214  
   215  
   216    db eval "SELECT * FROM ${tbl}_content" E {
   217      set iCol 0
   218      set iDoc $E(docid)
   219      foreach col [lrange $E(*) 1 end] {
   220        set c $E($col)
   221        set sql {SELECT fts3_tokenizer_test('simple', $c)}
   222  
   223        foreach {pos term dummy} [db one $sql] {
   224          if {![info exists C($iDoc,$iCol,$pos)]} {
   225            set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
   226            lappend errors $es
   227          } else {
   228            if {[string compare $C($iDoc,$iCol,$pos) $term]} {
   229              set    es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
   230              append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
   231              lappend errors $es
   232            }
   233            unset C($iDoc,$iCol,$pos)
   234          }
   235        }
   236        incr iCol
   237      }
   238    }
   239  
   240    foreach c [array names C] {
   241      lappend errors "Bad index entry: $c -> $C($c)"
   242    }
   243  
   244    if {[info exists errors]} { return [join $errors "\n"] }
   245    return "ok"
   246  }
   247  
   248  # USAGE: fts3_terms TBL WHERE
   249  #
   250  # Argument TBL must be the name of an FTS3 table. Argument WHERE is an
   251  # SQL expression that will be used as the WHERE clause when scanning
   252  # the %_segdir table. As in the following query:
   253  #
   254  #   "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
   255  #
   256  # This function returns a list of all terms present in the segments
   257  # selected by the statement above.
   258  #
   259  proc fts3_terms {tbl where} {
   260    fts3_read $tbl $where a
   261    return [lsort [array names a]]
   262  }
   263  
   264  
   265  # USAGE: fts3_doclist TBL TERM WHERE
   266  #
   267  # Argument TBL must be the name of an FTS3 table. TERM is a term that may
   268  # or may not be present in the table. Argument WHERE is used to select a 
   269  # subset of the b-tree segments in the associated full-text index as 
   270  # described above for [fts3_terms].
   271  #
   272  # This function returns the results of merging the doclists associated
   273  # with TERM in the selected segments. Each doclist is an element of the
   274  # returned list. Each doclist is formatted as follows:
   275  #
   276  #   [$docid ?$col[$off1 $off2...]?...]
   277  #
   278  # The formatting is odd for a Tcl command in order to be compatible with
   279  # the original C-language implementation. If argument WHERE is "1", then 
   280  # any empty doclists are omitted from the returned list.
   281  #
   282  proc fts3_doclist {tbl term where} {
   283    fts3_read $tbl $where a
   284  
   285  
   286    foreach doclist $a($term) {
   287      set docid 0
   288  
   289      while {[string length $doclist]>0} {
   290        set iCol 0
   291        set iPos 0
   292        set lPos [list]
   293        set lCol [list]
   294        incr docid [gobble_varint doclist]
   295    
   296        while {[set iDelta [gobble_varint doclist]] > 0} {
   297          if {$iDelta == 1} {
   298            lappend lCol [list $iCol $lPos]
   299            set iPos 0
   300            set lPos [list]
   301            set iCol [gobble_varint doclist]
   302          } else {
   303            incr iPos $iDelta
   304            incr iPos -2
   305            lappend lPos $iPos
   306          }
   307        }
   308    
   309        if {[llength $lPos]>0} {
   310          lappend lCol [list $iCol $lPos]
   311        }
   312    
   313        if {$where != "1" || [llength $lCol]>0} {
   314          set ret($docid) $lCol
   315        } else {
   316          unset -nocomplain ret($docid)
   317        }
   318      }
   319    }
   320  
   321    set lDoc [list]
   322    foreach docid [lsort -integer [array names ret]] {
   323      set lCol [list]
   324      set cols ""
   325      foreach col $ret($docid) {
   326        foreach {iCol lPos} $col {}
   327        append cols " $iCol\[[join $lPos { }]\]"
   328      }
   329      lappend lDoc "\[${docid}${cols}\]"
   330    }
   331  
   332    join $lDoc " "
   333  }
   334  
   335  ###########################################################################
   336  
   337  proc gobble_varint {varname} {
   338    upvar $varname blob
   339    set n [read_fts3varint $blob ret]
   340    set blob [string range $blob $n end]
   341    return $ret
   342  }
   343  proc gobble_string {varname nLength} {
   344    upvar $varname blob
   345    set ret [string range $blob 0 [expr $nLength-1]]
   346    set blob [string range $blob $nLength end]
   347    return $ret
   348  }
   349  
   350  # The argument is a blob of data representing an FTS3 segment leaf. 
   351  # Return a list consisting of alternating terms (strings) and doclists
   352  # (blobs of data).
   353  #
   354  proc fts3_readleaf {blob} {
   355    set zPrev ""
   356    set terms [list]
   357  
   358    while {[string length $blob] > 0} {
   359      set nPrefix [gobble_varint blob]
   360      set nSuffix [gobble_varint blob]
   361  
   362      set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
   363      append zTerm [gobble_string blob $nSuffix]
   364      set nDoclist [gobble_varint blob]
   365      set doclist [gobble_string blob $nDoclist]
   366  
   367      lappend terms $zTerm $doclist
   368      set zPrev $zTerm
   369    }
   370  
   371    return $terms
   372  }
   373  
   374  proc fts3_read2 {tbl where varname} {
   375    upvar $varname a
   376    array unset a
   377    db eval " SELECT start_block, leaves_end_block, root 
   378              FROM ${tbl}_segdir WHERE $where
   379              ORDER BY level ASC, idx DESC
   380    " {
   381      set c 0
   382      binary scan $root c c
   383      if {$c==0} {
   384        foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
   385      } else {
   386        db eval " SELECT block 
   387                  FROM ${tbl}_segments 
   388                  WHERE blockid>=$start_block AND blockid<=$leaves_end_block
   389                  ORDER BY blockid
   390        " {
   391          foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
   392        }
   393      }
   394    }
   395  }
   396  
   397  proc fts3_read {tbl where varname} {
   398    upvar $varname a
   399    array unset a
   400    db eval " SELECT start_block, leaves_end_block, root 
   401              FROM ${tbl}_segdir WHERE $where
   402              ORDER BY level DESC, idx ASC
   403    " {
   404      if {$start_block == 0} {
   405        foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
   406      } else {
   407        db eval " SELECT block 
   408                  FROM ${tbl}_segments 
   409                  WHERE blockid>=$start_block AND blockid<$leaves_end_block
   410                  ORDER BY blockid
   411        " {
   412          foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
   413  
   414        }
   415      }
   416    }
   417  }
   418  
   419  ##########################################################################
   420