modernc.org/cc@v1.0.1/v2/testdata/_sqlite/test/fts3_common.tcl (about)

     1  # 2009 November 04
     2  #
     3  # The author disclaims copyright to this source code.  In place of
     4  # a legal notice, here is a blessing:
     5  #
     6  #    May you do good and not evil.
     7  #    May you find forgiveness for yourself and forgive others.
     8  #    May you share freely, never taking more than you give.
     9  #
    10  #***********************************************************************
    11  #
    12  # This file contains common code used the fts3 tests. At one point
    13  # equivalent functionality was implemented in C code. But it is easier
    14  # to use Tcl.
    15  #
    16  
    17  #-------------------------------------------------------------------------
    18  # INSTRUCTIONS
    19  #
    20  # The following commands are available:
    21  #
    22  #   fts3_build_db_1 N
    23  #     Using database handle [db] create an FTS4 table named t1 and populate
    24  #     it with N rows of data. N must be less than 10,000. Refer to the
    25  #     header comments above the proc implementation below for details.
    26  #
    27  #   fts3_build_db_2 N
    28  #     Using database handle [db] create an FTS4 table named t2 and populate
    29  #     it with N rows of data. N must be less than 100,000. Refer to the
    30  #     header comments above the proc implementation below for details.
    31  #
    32  #   fts3_integrity_check TBL
    33  #     TBL must be an FTS table in the database currently opened by handle
    34  #     [db]. This proc loads and tokenizes all documents within the table,
    35  #     then checks that the current contents of the FTS index matches the
    36  #     results.
    37  #
    38  #   fts3_terms TBL WHERE
    39  #     Todo.
    40  #
    41  #   fts3_doclist TBL TERM WHERE
    42  #     Todo.
    43  #
    44  #
    45  #
    46  
    47  #-------------------------------------------------------------------------
    48  # USAGE: fts3_build_db_1 SWITCHES N
    49  #
    50  # Build a sample FTS table in the database opened by database connection 
    51  # [db]. The name of the new table is "t1".
    52  #
    53  proc fts3_build_db_1 {args} {
    54  
    55    set default(-module) fts4
    56  
    57    set nArg [llength $args]
    58    if {($nArg%2)==0} {
    59      error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
    60    }
    61  
    62    set n [lindex $args [expr $nArg-1]]
    63    array set opts [array get default]
    64    array set opts [lrange $args 0 [expr $nArg-2]]
    65    foreach k [array names opts] {
    66      if {0==[info exists default($k)]} { error "unknown option: $k" }
    67    }
    68  
    69    if {$n > 10000} {error "n must be <= 10000"}
    70    db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)"
    71  
    72    set xwords [list zero one two three four five six seven eight nine ten]
    73    set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
    74  
    75    for {set i 0} {$i < $n} {incr i} {
    76      set x ""
    77      set y ""
    78  
    79      set x [list]
    80      lappend x [lindex $xwords [expr ($i / 1000) % 10]]
    81      lappend x [lindex $xwords [expr ($i / 100)  % 10]]
    82      lappend x [lindex $xwords [expr ($i / 10)   % 10]]
    83      lappend x [lindex $xwords [expr ($i / 1)   % 10]]
    84  
    85      set y [list]
    86      lappend y [lindex $ywords [expr ($i / 1000) % 10]]
    87      lappend y [lindex $ywords [expr ($i / 100)  % 10]]
    88      lappend y [lindex $ywords [expr ($i / 10)   % 10]]
    89      lappend y [lindex $ywords [expr ($i / 1)   % 10]]
    90  
    91      db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) }
    92    }
    93  }
    94  
    95  #-------------------------------------------------------------------------
    96  # USAGE: fts3_build_db_2 N ARGS
    97  #
    98  # Build a sample FTS table in the database opened by database connection 
    99  # [db]. The name of the new table is "t2".
   100  #
   101  proc fts3_build_db_2 {args} {
   102  
   103    set default(-module) fts4
   104    set default(-extra)   ""
   105  
   106    set nArg [llength $args]
   107    if {($nArg%2)==0} {
   108      error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
   109    }
   110  
   111    set n [lindex $args [expr $nArg-1]]
   112    array set opts [array get default]
   113    array set opts [lrange $args 0 [expr $nArg-2]]
   114    foreach k [array names opts] {
   115      if {0==[info exists default($k)]} { error "unknown option: $k" }
   116    }
   117  
   118    if {$n > 100000} {error "n must be <= 100000"}
   119  
   120    set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content"
   121    if {$opts(-extra) != ""} {
   122      append sql ", " $opts(-extra)
   123    }
   124    append sql ")"
   125    db eval $sql
   126  
   127    set chars [list a b c d e f g h  i j k l m n o p  q r s t u v w x  y z ""]
   128  
   129    for {set i 0} {$i < $n} {incr i} {
   130      set word ""
   131      set nChar [llength $chars]
   132      append word [lindex $chars [expr {($i / 1)   % $nChar}]]
   133      append word [lindex $chars [expr {($i / $nChar)  % $nChar}]]
   134      append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]]
   135  
   136      db eval { INSERT INTO t2(docid, content) VALUES($i, $word) }
   137    }
   138  }
   139  
   140  #-------------------------------------------------------------------------
   141  # USAGE: fts3_integrity_check TBL
   142  #
   143  # This proc is used to verify that the full-text index is consistent with
   144  # the contents of the fts3 table. In other words, it checks that the
   145  # data in the %_contents table matches that in the %_segdir and %_segments 
   146  # tables.
   147  #
   148  # This is not an efficient procedure. It uses a lot of memory and a lot
   149  # of CPU. But it is better than not checking at all.
   150  #
   151  # The procedure is:
   152  #
   153  #   1) Read the entire full-text index from the %_segdir and %_segments
   154  #      tables into memory. For each entry in the index, the following is
   155  #      done:
   156  #
   157  #          set C($iDocid,$iCol,$iPosition) $zTerm
   158  #
   159  #   2) Iterate through each column of each row of the %_content table. 
   160  #      Tokenize all documents, and check that for each token there is
   161  #      a corresponding entry in the $C array. After checking a token,
   162  #      [unset] the $C array entry.
   163  #
   164  #   3) Check that array $C is now empty.
   165  #      
   166  #
   167  proc fts3_integrity_check {tbl} {
   168  
   169    fts3_read2 $tbl 1 A
   170  
   171    foreach zTerm [array names A] {
   172      #puts $zTerm
   173      foreach doclist $A($zTerm) {
   174        set docid 0
   175        while {[string length $doclist]>0} {
   176          set iCol 0
   177          set iPos 0
   178          set lPos [list]
   179          set lCol [list]
   180  
   181          # First varint of a doclist-entry is the docid. Delta-compressed
   182          # with respect to the docid of the previous entry.
   183          #
   184          incr docid [gobble_varint doclist]
   185          if {[info exists D($zTerm,$docid)]} {
   186            while {[set iDelta [gobble_varint doclist]] != 0} {}
   187            continue
   188          }
   189          set D($zTerm,$docid) 1
   190  
   191          # Gobble varints until the 0x00 that terminates the doclist-entry
   192          # is found.
   193          while {[set iDelta [gobble_varint doclist]] > 0} {
   194            if {$iDelta == 1} {
   195              set iCol [gobble_varint doclist]
   196              set iPos 0
   197            } else {
   198              incr iPos $iDelta
   199              incr iPos -2
   200              set C($docid,$iCol,$iPos) $zTerm
   201            }
   202          }
   203        }
   204      }
   205    }
   206  
   207    foreach key [array names C] {
   208      #puts "$key -> $C($key)"
   209    }
   210  
   211  
   212    db eval "SELECT * FROM ${tbl}_content" E {
   213      set iCol 0
   214      set iDoc $E(docid)
   215      foreach col [lrange $E(*) 1 end] {
   216        set c $E($col)
   217        set sql {SELECT fts3_tokenizer_test('simple', $c)}
   218  
   219        foreach {pos term dummy} [db one $sql] {
   220          if {![info exists C($iDoc,$iCol,$pos)]} {
   221            set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
   222            lappend errors $es
   223          } else {
   224            if {[string compare $C($iDoc,$iCol,$pos) $term]} {
   225              set    es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
   226              append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
   227              lappend errors $es
   228            }
   229            unset C($iDoc,$iCol,$pos)
   230          }
   231        }
   232        incr iCol
   233      }
   234    }
   235  
   236    foreach c [array names C] {
   237      lappend errors "Bad index entry: $c -> $C($c)"
   238    }
   239  
   240    if {[info exists errors]} { return [join $errors "\n"] }
   241    return "ok"
   242  }
   243  
   244  # USAGE: fts3_terms TBL WHERE
   245  #
   246  # Argument TBL must be the name of an FTS3 table. Argument WHERE is an
   247  # SQL expression that will be used as the WHERE clause when scanning
   248  # the %_segdir table. As in the following query:
   249  #
   250  #   "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
   251  #
   252  # This function returns a list of all terms present in the segments
   253  # selected by the statement above.
   254  #
   255  proc fts3_terms {tbl where} {
   256    fts3_read $tbl $where a
   257    return [lsort [array names a]]
   258  }
   259  
   260  
   261  # USAGE: fts3_doclist TBL TERM WHERE
   262  #
   263  # Argument TBL must be the name of an FTS3 table. TERM is a term that may
   264  # or may not be present in the table. Argument WHERE is used to select a 
   265  # subset of the b-tree segments in the associated full-text index as 
   266  # described above for [fts3_terms].
   267  #
   268  # This function returns the results of merging the doclists associated
   269  # with TERM in the selected segments. Each doclist is an element of the
   270  # returned list. Each doclist is formatted as follows:
   271  #
   272  #   [$docid ?$col[$off1 $off2...]?...]
   273  #
   274  # The formatting is odd for a Tcl command in order to be compatible with
   275  # the original C-language implementation. If argument WHERE is "1", then 
   276  # any empty doclists are omitted from the returned list.
   277  #
   278  proc fts3_doclist {tbl term where} {
   279    fts3_read $tbl $where a
   280  
   281  
   282    foreach doclist $a($term) {
   283      set docid 0
   284  
   285      while {[string length $doclist]>0} {
   286        set iCol 0
   287        set iPos 0
   288        set lPos [list]
   289        set lCol [list]
   290        incr docid [gobble_varint doclist]
   291    
   292        while {[set iDelta [gobble_varint doclist]] > 0} {
   293          if {$iDelta == 1} {
   294            lappend lCol [list $iCol $lPos]
   295            set iPos 0
   296            set lPos [list]
   297            set iCol [gobble_varint doclist]
   298          } else {
   299            incr iPos $iDelta
   300            incr iPos -2
   301            lappend lPos $iPos
   302          }
   303        }
   304    
   305        if {[llength $lPos]>0} {
   306          lappend lCol [list $iCol $lPos]
   307        }
   308    
   309        if {$where != "1" || [llength $lCol]>0} {
   310          set ret($docid) $lCol
   311        } else {
   312          unset -nocomplain ret($docid)
   313        }
   314      }
   315    }
   316  
   317    set lDoc [list]
   318    foreach docid [lsort -integer [array names ret]] {
   319      set lCol [list]
   320      set cols ""
   321      foreach col $ret($docid) {
   322        foreach {iCol lPos} $col {}
   323        append cols " $iCol\[[join $lPos { }]\]"
   324      }
   325      lappend lDoc "\[${docid}${cols}\]"
   326    }
   327  
   328    join $lDoc " "
   329  }
   330  
   331  ###########################################################################
   332  
   333  proc gobble_varint {varname} {
   334    upvar $varname blob
   335    set n [read_fts3varint $blob ret]
   336    set blob [string range $blob $n end]
   337    return $ret
   338  }
   339  proc gobble_string {varname nLength} {
   340    upvar $varname blob
   341    set ret [string range $blob 0 [expr $nLength-1]]
   342    set blob [string range $blob $nLength end]
   343    return $ret
   344  }
   345  
   346  # The argument is a blob of data representing an FTS3 segment leaf. 
   347  # Return a list consisting of alternating terms (strings) and doclists
   348  # (blobs of data).
   349  #
   350  proc fts3_readleaf {blob} {
   351    set zPrev ""
   352    set terms [list]
   353  
   354    while {[string length $blob] > 0} {
   355      set nPrefix [gobble_varint blob]
   356      set nSuffix [gobble_varint blob]
   357  
   358      set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
   359      append zTerm [gobble_string blob $nSuffix]
   360      set nDoclist [gobble_varint blob]
   361      set doclist [gobble_string blob $nDoclist]
   362  
   363      lappend terms $zTerm $doclist
   364      set zPrev $zTerm
   365    }
   366  
   367    return $terms
   368  }
   369  
   370  proc fts3_read2 {tbl where varname} {
   371    upvar $varname a
   372    array unset a
   373    db eval " SELECT start_block, leaves_end_block, root 
   374              FROM ${tbl}_segdir WHERE $where
   375              ORDER BY level ASC, idx DESC
   376    " {
   377      set c 0
   378      binary scan $root c c
   379      if {$c==0} {
   380        foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
   381      } else {
   382        db eval " SELECT block 
   383                  FROM ${tbl}_segments 
   384                  WHERE blockid>=$start_block AND blockid<=$leaves_end_block
   385                  ORDER BY blockid
   386        " {
   387          foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
   388        }
   389      }
   390    }
   391  }
   392  
   393  proc fts3_read {tbl where varname} {
   394    upvar $varname a
   395    array unset a
   396    db eval " SELECT start_block, leaves_end_block, root 
   397              FROM ${tbl}_segdir WHERE $where
   398              ORDER BY level DESC, idx ASC
   399    " {
   400      if {$start_block == 0} {
   401        foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
   402      } else {
   403        db eval " SELECT block 
   404                  FROM ${tbl}_segments 
   405                  WHERE blockid>=$start_block AND blockid<$leaves_end_block
   406                  ORDER BY blockid
   407        " {
   408          foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
   409  
   410        }
   411      }
   412    }
   413  }
   414  
   415  ##########################################################################
   416