modernc.org/cc@v1.0.1/v2/testdata/_sqlite/ext/fts5/test/fts5_common.tcl (about)

     1  # 2014 Dec 19
     2  #
     3  # The author disclaims copyright to this source code.  In place of
     4  # a legal notice, here is a blessing:
     5  #
     6  #    May you do good and not evil.
     7  #    May you find forgiveness for yourself and forgive others.
     8  #    May you share freely, never taking more than you give.
     9  #
    10  #***********************************************************************
    11  #
    12  
    13  if {![info exists testdir]} {
    14    set testdir [file join [file dirname [info script]] .. .. .. test]
    15  }
    16  source $testdir/tester.tcl
    17  
    18  ifcapable !fts5 {
    19    proc return_if_no_fts5 {} {
    20      finish_test
    21      return -code return
    22    }
    23    return
    24  } else {
    25    proc return_if_no_fts5 {} {}
    26  }
    27  
    28  catch { 
    29    sqlite3_fts5_may_be_corrupt 0 
    30    reset_db
    31  }
    32  
    33  proc fts5_test_poslist {cmd} {
    34    set res [list]
    35    for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
    36      lappend res [string map {{ } .} [$cmd xInst $i]]
    37    }
    38    set res
    39  }
    40  
    41  proc fts5_test_poslist2 {cmd} {
    42    set res [list]
    43  
    44    for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
    45      $cmd xPhraseForeach $i c o {
    46        lappend res $i.$c.$o
    47      }
    48    }
    49  
    50    #set res
    51    sort_poslist $res
    52  }
    53  
    54  proc fts5_test_collist {cmd} {
    55    set res [list]
    56  
    57    for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
    58      $cmd xPhraseColumnForeach $i c { lappend res $i.$c }
    59    }
    60  
    61    set res
    62  }
    63  
    64  proc fts5_test_columnsize {cmd} {
    65    set res [list]
    66    for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
    67      lappend res [$cmd xColumnSize $i]
    68    }
    69    set res
    70  }
    71  
    72  proc fts5_test_columntext {cmd} {
    73    set res [list]
    74    for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
    75      lappend res [$cmd xColumnText $i]
    76    }
    77    set res
    78  }
    79  
    80  proc fts5_test_columntotalsize {cmd} {
    81    set res [list]
    82    for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
    83      lappend res [$cmd xColumnTotalSize $i]
    84    }
    85    set res
    86  }
    87  
    88  proc test_append_token {varname token iStart iEnd} {
    89    upvar $varname var
    90    lappend var $token
    91    return "SQLITE_OK"
    92  }
    93  proc fts5_test_tokenize {cmd} {
    94    set res [list]
    95    for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
    96      set tokens [list]
    97      $cmd xTokenize [$cmd xColumnText $i] [list test_append_token tokens]
    98      lappend res $tokens
    99    }
   100    set res
   101  }
   102  
   103  proc fts5_test_rowcount {cmd} {
   104    $cmd xRowCount
   105  }
   106  
   107  proc test_queryphrase_cb {cnt cmd} {
   108    upvar $cnt L 
   109    for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
   110      foreach {ip ic io} [$cmd xInst $i] break
   111      set A($ic) 1
   112    }
   113    foreach ic [array names A] {
   114      lset L $ic [expr {[lindex $L $ic] + 1}]
   115    }
   116  }
   117  proc fts5_test_queryphrase {cmd} {
   118    set res [list]
   119    for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
   120      set cnt [list]
   121      for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 }
   122      $cmd xQueryPhrase $i [list test_queryphrase_cb cnt]
   123      lappend res $cnt
   124    }
   125    set res
   126  }
   127  
   128  proc fts5_test_phrasecount {cmd} {
   129    $cmd xPhraseCount
   130  }
   131  
   132  proc fts5_test_all {cmd} {
   133    set res [list]
   134    lappend res columnsize      [fts5_test_columnsize $cmd]
   135    lappend res columntext      [fts5_test_columntext $cmd]
   136    lappend res columntotalsize [fts5_test_columntotalsize $cmd]
   137    lappend res poslist         [fts5_test_poslist $cmd]
   138    lappend res tokenize        [fts5_test_tokenize $cmd]
   139    lappend res rowcount        [fts5_test_rowcount $cmd]
   140    set res
   141  }
   142  
   143  proc fts5_aux_test_functions {db} {
   144    foreach f {
   145      fts5_test_columnsize
   146      fts5_test_columntext
   147      fts5_test_columntotalsize
   148      fts5_test_poslist
   149      fts5_test_poslist2
   150      fts5_test_collist
   151      fts5_test_tokenize
   152      fts5_test_rowcount
   153      fts5_test_all
   154  
   155      fts5_test_queryphrase
   156      fts5_test_phrasecount
   157    } {
   158      sqlite3_fts5_create_function $db $f $f
   159    }
   160  }
   161  
   162  proc fts5_segcount {tbl} {
   163    set N 0
   164    foreach n [fts5_level_segs $tbl] { incr N $n }
   165    set N
   166  }
   167  
   168  proc fts5_level_segs {tbl} {
   169    set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
   170    set ret [list]
   171    foreach L [lrange [db one $sql] 1 end] {
   172      lappend ret [expr [llength $L] - 3]
   173    }
   174    set ret
   175  } 
   176  
   177  proc fts5_level_segids {tbl} {
   178    set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
   179    set ret [list]
   180    foreach L [lrange [db one $sql] 1 end] {
   181      set lvl [list]
   182      foreach S [lrange $L 3 end] {
   183        regexp {id=([1234567890]*)} $S -> segid
   184        lappend lvl $segid
   185      }
   186      lappend ret $lvl
   187    }
   188    set ret
   189  }
   190  
   191  proc fts5_rnddoc {n} {
   192    set map [list 0 a  1 b  2 c  3 d  4 e  5 f  6 g  7 h  8 i  9 j]
   193    set doc [list]
   194    for {set i 0} {$i < $n} {incr i} {
   195      lappend doc "x[string map $map [format %.3d [expr int(rand()*1000)]]]"
   196    }
   197    set doc
   198  }
   199  
   200  #-------------------------------------------------------------------------
   201  # Usage:
   202  #
   203  #   nearset aCol ?-pc VARNAME? ?-near N? ?-col C? -- phrase1 phrase2...
   204  #
   205  # This command is used to test if a document (set of column values) matches
   206  # the logical equivalent of a single FTS5 NEAR() clump and, if so, return
   207  # the equivalent of an FTS5 position list.
   208  #
   209  # Parameter $aCol is passed a list of the column values for the document
   210  # to test. Parameters $phrase1 and so on are the phrases.
   211  #
   212  # The result is a list of phrase hits. Each phrase hit is formatted as
   213  # three integers separated by "." characters, in the following format:
   214  #
   215  #   <phrase number> . <column number> . <token offset>
   216  #
   217  # Options:
   218  #
   219  #   -near N        (NEAR distance. Default 10)
   220  #   -col  C        (List of column indexes to match against)
   221  #   -pc   VARNAME  (variable in caller frame to use for phrase numbering)
   222  #   -dict VARNAME  (array in caller frame to use for synonyms)
   223  #
   224  proc nearset {aCol args} {
   225  
   226    # Process the command line options.
   227    #
   228    set O(-near) 10
   229    set O(-col)  {}
   230    set O(-pc)   ""
   231    set O(-dict) ""
   232  
   233    set nOpt [lsearch -exact $args --]
   234    if {$nOpt<0} { error "no -- option" }
   235  
   236    # Set $lPhrase to be a list of phrases. $nPhrase its length.
   237    set lPhrase [lrange $args [expr $nOpt+1] end]
   238    set nPhrase [llength $lPhrase]
   239  
   240    foreach {k v} [lrange $args 0 [expr $nOpt-1]] {
   241      if {[info exists O($k)]==0} { error "unrecognized option $k" }
   242      set O($k) $v
   243    }
   244  
   245    if {$O(-pc) == ""} {
   246      set counter 0
   247    } else {
   248      upvar $O(-pc) counter
   249    }
   250  
   251    if {$O(-dict)!=""} { upvar $O(-dict) aDict }
   252  
   253    for {set j 0} {$j < [llength $aCol]} {incr j} {
   254      for {set i 0} {$i < $nPhrase} {incr i} { 
   255        set A($j,$i) [list]
   256      }
   257    }
   258  
   259    # Loop through each column of the current row.
   260    for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
   261  
   262      # If there is a column filter, test whether this column is excluded. If
   263      # so, skip to the next iteration of this loop. Otherwise, set zCol to the
   264      # column value and nToken to the number of tokens that comprise it.
   265      if {$O(-col)!="" && [lsearch $O(-col) $iCol]<0} continue
   266      set zCol [lindex $aCol $iCol]
   267      set nToken [llength $zCol]
   268  
   269      # Each iteration of the following loop searches a substring of the 
   270      # column value for phrase matches. The last token of the substring
   271      # is token $iLast of the column value. The first token is:
   272      #
   273      #   iFirst = ($iLast - $O(-near) - 1)
   274      #
   275      # where $sz is the length of the phrase being searched for. A phrase 
   276      # counts as matching the substring if its first token lies on or before
   277      # $iLast and its last token on or after $iFirst.
   278      #
   279      # For example, if the query is "NEAR(a+b c, 2)" and the column value:
   280      #
   281      #   "x x x x A B x x C x"
   282      #    0 1 2 3 4 5 6 7 8 9"
   283      #
   284      # when (iLast==8 && iFirst=5) the range will contain both phrases and
   285      # so both instances can be added to the output poslists.
   286      #
   287      set iLast [expr $O(-near) >= $nToken ? $nToken - 1 : $O(-near)]
   288      for { } {$iLast < $nToken} {incr iLast} {
   289  
   290        catch { array unset B }
   291        
   292        for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
   293          set p [lindex $lPhrase $iPhrase]
   294          set nPm1 [expr {[llength $p] - 1}]
   295          set iFirst [expr $iLast - $O(-near) - [llength $p]]
   296  
   297          for {set i $iFirst} {$i <= $iLast} {incr i} {
   298            set lCand [lrange $zCol $i [expr $i+$nPm1]]
   299            set bMatch 1
   300            foreach tok $p term $lCand {
   301              if {[nearset_match aDict $tok $term]==0} { set bMatch 0 ; break }
   302            }
   303            if {$bMatch} { lappend B($iPhrase) $i }
   304          }
   305  
   306          if {![info exists B($iPhrase)]} break
   307        }
   308  
   309        if {$iPhrase==$nPhrase} {
   310          for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
   311            set A($iCol,$iPhrase) [concat $A($iCol,$iPhrase) $B($iPhrase)]
   312            set A($iCol,$iPhrase) [lsort -integer -uniq $A($iCol,$iPhrase)]
   313          }
   314        }
   315      }
   316    }
   317  
   318    set res [list]
   319    #puts [array names A]
   320  
   321    for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
   322      for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
   323        foreach a $A($iCol,$iPhrase) {
   324          lappend res "$counter.$iCol.$a"
   325        }
   326      }
   327      incr counter
   328    }
   329  
   330    #puts "$aCol -> $res"
   331    sort_poslist $res
   332  }
   333  
   334  proc nearset_match {aDictVar tok term} {
   335    if {[string match $tok $term]} { return 1 }
   336  
   337    upvar $aDictVar aDict
   338    if {[info exists aDict($tok)]} {
   339      foreach s $aDict($tok) {
   340        if {[string match $s $term]} { return 1 }
   341      }
   342    }
   343    return 0;
   344  }
   345  
   346  #-------------------------------------------------------------------------
   347  # Usage:
   348  #
   349  #   sort_poslist LIST
   350  #
   351  # Sort a position list of the type returned by command [nearset]
   352  #
   353  proc sort_poslist {L} {
   354    lsort -command instcompare $L
   355  }
   356  proc instcompare {lhs rhs} {
   357    foreach {p1 c1 o1} [split $lhs .] {}
   358    foreach {p2 c2 o2} [split $rhs .] {}
   359  
   360    set res [expr $c1 - $c2]
   361    if {$res==0} { set res [expr $o1 - $o2] }
   362    if {$res==0} { set res [expr $p1 - $p2] }
   363  
   364    return $res
   365  }
   366  
   367  #-------------------------------------------------------------------------
   368  # Logical operators used by the commands returned by fts5_tcl_expr().
   369  #
   370  proc AND {args} {
   371    foreach a $args {
   372      if {[llength $a]==0} { return [list] }
   373    }
   374    sort_poslist [concat {*}$args]
   375  }
   376  proc OR {args} {
   377    sort_poslist [concat {*}$args]
   378  }
   379  proc NOT {a b} {
   380    if {[llength $b]>0} { return [list] }
   381    return $a
   382  }
   383  
   384  #-------------------------------------------------------------------------
   385  # This command is similar to [split], except that it also provides the
   386  # start and end offsets of each token. For example:
   387  #
   388  #   [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8}
   389  #
   390  
   391  proc gobble_whitespace {textvar} {
   392    upvar $textvar t
   393    regexp {([ ]*)(.*)} $t -> space t
   394    return [string length $space]
   395  }
   396  
   397  proc gobble_text {textvar wordvar} {
   398    upvar $textvar t
   399    upvar $wordvar w
   400    regexp {([^ ]*)(.*)} $t -> w t
   401    return [string length $w]
   402  }
   403  
   404  proc fts5_tokenize_split {text} {
   405    set token ""
   406    set ret [list]
   407    set iOff [gobble_whitespace text]
   408    while {[set nToken [gobble_text text word]]} {
   409      lappend ret $word $iOff [expr $iOff+$nToken]
   410      incr iOff $nToken
   411      incr iOff [gobble_whitespace text]
   412    }
   413  
   414    set ret
   415  }
   416  
   417  #-------------------------------------------------------------------------
   418  #
   419  proc foreach_detail_mode {prefix script} {
   420    set saved $::testprefix
   421    foreach d [list full col none] {
   422      set s [string map [list %DETAIL% $d] $script]
   423      set ::detail $d
   424      set ::testprefix "$prefix-$d"
   425      reset_db
   426      uplevel $s
   427      unset ::detail
   428    }
   429    set ::testprefix $saved
   430  }
   431  
   432  proc detail_check {} {
   433    if {$::detail != "none" && $::detail!="full" && $::detail!="col"} {
   434      error "not in foreach_detail_mode {...} block"
   435    }
   436  }
   437  proc detail_is_none {} { detail_check ; expr {$::detail == "none"} }
   438  proc detail_is_col {}  { detail_check ; expr {$::detail == "col" } }
   439  proc detail_is_full {} { detail_check ; expr {$::detail == "full"} }
   440  
   441  
   442  #-------------------------------------------------------------------------
   443  # Convert a poslist of the type returned by fts5_test_poslist() to a 
   444  # collist as returned by fts5_test_collist().
   445  #
   446  proc fts5_poslist2collist {poslist} {
   447    set res [list]
   448    foreach h $poslist {
   449      regexp {(.*)\.[1234567890]+} $h -> cand
   450      lappend res $cand
   451    }
   452    set res [lsort -command fts5_collist_elem_compare -unique $res]
   453    return $res
   454  }
   455  
   456  # Comparison function used by fts5_poslist2collist to sort collist entries.
   457  proc fts5_collist_elem_compare {a b} {
   458    foreach {a1 a2} [split $a .] {}
   459    foreach {b1 b2} [split $b .] {}
   460  
   461    if {$a1==$b1} { return [expr $a2 - $b2] }
   462    return [expr $a1 - $b1]
   463  }
   464  
   465  
   466  #--------------------------------------------------------------------------
   467  # Construct and return a tcl list equivalent to that returned by the SQL
   468  # query executed against database handle [db]:
   469  #
   470  #   SELECT 
   471  #     rowid, 
   472  #     fts5_test_poslist($tbl),
   473  #     fts5_test_collist($tbl) 
   474  #   FROM $tbl('$expr')
   475  #   ORDER BY rowid $order;
   476  #
   477  proc fts5_query_data {expr tbl {order ASC} {aDictVar ""}} {
   478  
   479    # Figure out the set of columns in the FTS5 table. This routine does
   480    # not handle tables with UNINDEXED columns, but if it did, it would
   481    # have to be here.
   482    db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
   483  
   484    set d ""
   485    if {$aDictVar != ""} {
   486      upvar $aDictVar aDict
   487      set d aDict
   488    }
   489  
   490    set cols ""
   491    foreach e $lCols { append cols ", '$e'" }
   492    set tclexpr [db one [subst -novar {
   493      SELECT fts5_expr_tcl( $expr, 'nearset $cols -dict $d -pc ::pc' [set cols] )
   494    }]]
   495  
   496    set res [list]
   497    db eval "SELECT rowid, * FROM $tbl ORDER BY rowid $order" x {
   498      set cols [list]
   499      foreach col $lCols { lappend cols $x($col) }
   500      
   501      set ::pc 0
   502      set rowdata [eval $tclexpr]
   503      if {$rowdata != ""} { 
   504        lappend res $x(rowid) $rowdata [fts5_poslist2collist $rowdata]
   505      }
   506    }
   507  
   508    set res
   509  }
   510  
   511  #-------------------------------------------------------------------------
   512  # Similar to [fts5_query_data], but omit the collist field.
   513  #
   514  proc fts5_poslist_data {expr tbl {order ASC} {aDictVar ""}} {
   515    set res [list]
   516  
   517    if {$aDictVar!=""} {
   518      upvar $aDictVar aDict
   519      set dict aDict
   520    } else {
   521      set dict ""
   522    }
   523  
   524    foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
   525      lappend res $rowid $poslist
   526    }
   527    set res
   528  }
   529  
   530  proc fts5_collist_data {expr tbl {order ASC} {aDictVar ""}} {
   531    set res [list]
   532  
   533    if {$aDictVar!=""} {
   534      upvar $aDictVar aDict
   535      set dict aDict
   536    } else {
   537      set dict ""
   538    }
   539  
   540    foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
   541      lappend res $rowid $collist
   542    }
   543    set res
   544  }
   545  
   546  #-------------------------------------------------------------------------
   547  #
   548  
   549  # This command will only work inside a [foreach_detail_mode] block. It tests
   550  # whether or not expression $expr run on FTS5 table $tbl is supported by
   551  # the current mode. If so, 1 is returned. If not, 0.
   552  #
   553  #   detail=full    (all queries supported)
   554  #   detail=col     (all but phrase queries and NEAR queries)
   555  #   detail=none    (all but phrase queries, NEAR queries, and column filters)
   556  #
   557  proc fts5_expr_ok {expr tbl} {
   558  
   559    if {![detail_is_full]} {
   560      set nearset "nearset_rc"
   561      if {[detail_is_col]} { set nearset "nearset_rf" }
   562  
   563      set ::expr_not_ok 0
   564      db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
   565  
   566      set cols ""
   567      foreach e $lCols { append cols ", '$e'" }
   568      set ::pc 0
   569      set tclexpr [db one [subst -novar {
   570        SELECT fts5_expr_tcl( $expr, '[set nearset] $cols -pc ::pc' [set cols] )
   571      }]]
   572      eval $tclexpr
   573      if {$::expr_not_ok} { return 0 }
   574    }
   575  
   576    return 1
   577  }
   578  
   579  # Helper for [fts5_expr_ok]
   580  proc nearset_rf {aCol args} {
   581    set idx [lsearch -exact $args --]
   582    if {$idx != [llength $args]-2 || [llength [lindex $args end]]!=1} {
   583      set ::expr_not_ok 1
   584    }
   585    list
   586  }
   587  
   588  # Helper for [fts5_expr_ok]
   589  proc nearset_rc {aCol args} {
   590    nearset_rf $aCol {*}$args
   591    if {[lsearch $args -col]>=0} { 
   592      set ::expr_not_ok 1
   593    }
   594    list
   595  }
   596  
   597  
   598  #-------------------------------------------------------------------------
   599  # Code for a simple Tcl tokenizer that supports synonyms at query time.
   600  #
   601  proc tclnum_tokenize {mode tflags text} {
   602    foreach {w iStart iEnd} [fts5_tokenize_split $text] {
   603      sqlite3_fts5_token $w $iStart $iEnd
   604      if {$tflags == $mode && [info exists ::tclnum_syn($w)]} {
   605        foreach s $::tclnum_syn($w)  { sqlite3_fts5_token -colo $s $iStart $iEnd }
   606      }
   607    }
   608  }
   609  
   610  proc tclnum_create {args} {
   611    set mode query
   612    if {[llength $args]} {
   613      set mode [lindex $args 0]
   614    }
   615    if {$mode != "query" && $mode != "document"} { error "bad mode: $mode" }
   616    return [list tclnum_tokenize $mode]
   617  }
   618  
   619  proc fts5_tclnum_register {db} {
   620    foreach SYNDICT {
   621      {zero  0}
   622      {one   1 i}
   623      {two   2 ii}
   624      {three 3 iii}
   625      {four  4 iv}
   626      {five  5 v}
   627      {six   6 vi}
   628      {seven 7 vii}
   629      {eight 8 viii}
   630      {nine  9 ix}
   631  
   632      {a1 a2 a3 a4 a5 a6 a7 a8 a9}
   633      {b1 b2 b3 b4 b5 b6 b7 b8 b9}
   634      {c1 c2 c3 c4 c5 c6 c7 c8 c9}
   635    } {
   636      foreach s $SYNDICT {
   637        set o [list]
   638        foreach x $SYNDICT {if {$x!=$s} {lappend o $x}}
   639        set ::tclnum_syn($s) $o
   640      }
   641    }
   642    sqlite3_fts5_create_tokenizer db tclnum tclnum_create
   643  }
   644  #
   645  # End of tokenizer code.
   646  #-------------------------------------------------------------------------
   647