github.com/jdgcs/sqlite3@v1.12.1-0.20210908114423-bc5f96e4dd51/testdata/tcl/fts3rnd.test

github.com/jdgcs/sqlite3@v1.12.1-0.20210908114423-bc5f96e4dd51/testdata/tcl/fts3rnd.test (about)

     1  # 2009 December 03
     2  #
     3  #    May you do good and not evil.
     4  #    May you find forgiveness for yourself and forgive others.
     5  #    May you share freely, never taking more than you give.
     6  #
     7  #***********************************************************************
     8  #
     9  # Brute force (random data) tests for FTS3.
    10  #
    11  
    12  #-------------------------------------------------------------------------
    13  #
    14  # The FTS3 tests implemented in this file focus on testing that FTS3
    15  # returns the correct set of documents for various types of full-text
    16  # query. This is done using pseudo-randomly generated data and queries.
    17  # The expected result of each query is calculated using Tcl code.
    18  #
    19  #   1. The database is initialized to contain a single table with three
    20  #      columns. 100 rows are inserted into the table. Each of the three
    21  #      values in each row is a document consisting of between 0 and 100
    22  #      terms. Terms are selected from a vocabulary of $G(nVocab) terms.
    23  #
    24  #   2. The following is performed 100 times:
    25  #
    26  #      a. A row is inserted into the database. The row contents are 
    27  #         generated as in step 1. The docid is a pseudo-randomly selected
    28  #         value between 0 and 1000000.
    29  # 
    30  #      b. A psuedo-randomly selected row is updated. One of its columns is
    31  #         set to contain a new document generated in the same way as the
    32  #         documents in step 1.
    33  # 
    34  #      c. A psuedo-randomly selected row is deleted.
    35  # 
    36  #      d. For each of several types of fts3 queries, 10 SELECT queries
    37  #         of the form:
    38  # 
    39  #           SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
    40  # 
    41  #         are evaluated. The results are compared to those calculated by
    42  #         Tcl code in this file. The patterns used for the different query
    43  #         types are:
    44  # 
    45  #           1.  query = <term>
    46  #           2.  query = <prefix>
    47  #           3.  query = "<term> <term>"
    48  #           4.  query = "<term> <term> <term>"
    49  #           5.  query = "<prefix> <prefix> <prefix>"
    50  #           6.  query = <term> NEAR <term>
    51  #           7.  query = <term> NEAR/11 <term> NEAR/11 <term>
    52  #           8.  query = <term> OR <term>
    53  #           9.  query = <term> NOT <term>
    54  #           10. query = <term> AND <term>
    55  #           11. query = <term> NEAR <term> OR <term> NEAR <term>
    56  #           12. query = <term> NEAR <term> NOT <term> NEAR <term>
    57  #           13. query = <term> NEAR <term> AND <term> NEAR <term>
    58  # 
    59  #         where <term> is a term psuedo-randomly selected from the vocabulary
    60  #         and prefix is the first 2 characters of such a term followed by
    61  #         a "*" character.
    62  #     
    63  #      Every second iteration, steps (a) through (d) above are performed
    64  #      within a single transaction. This forces the queries in (d) to
    65  #      read data from both the database and the in-memory hash table
    66  #      that caches the full-text index entries created by steps (a), (b)
    67  #      and (c) until the transaction is committed.
    68  #
    69  # The procedure above is run 5 times, using advisory fts3 node sizes of 50,
    70  # 500, 1000 and 2000 bytes.
    71  #
    72  # After the test using an advisory node-size of 50, an OOM test is run using
    73  # the database. This test is similar to step (d) above, except that it tests
    74  # the effects of transient and persistent OOM conditions encountered while
    75  # executing each query.
    76  #
    77  
    78  set testdir [file dirname $argv0]
    79  source $testdir/tester.tcl
    80  
    81  # If this build does not include FTS3, skip the tests in this file.
    82  #
    83  ifcapable !fts3 { finish_test ; return }
    84  source $testdir/fts3_common.tcl
    85  source $testdir/malloc_common.tcl
    86  
    87  set G(nVocab) 100
    88  
    89  set nVocab 100
    90  set lVocab [list]
    91  
    92  expr srand(0)
    93  
    94  # Generate a vocabulary of nVocab words. Each word is 3 characters long.
    95  #
    96  set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
    97  for {set i 0} {$i < $nVocab} {incr i} {
    98    set len [expr int(rand()*3)+2]
    99    set    word [lindex $lChar [expr int(rand()*26)]]
   100    append word [lindex $lChar [expr int(rand()*26)]]
   101    if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
   102    if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
   103    lappend lVocab $word
   104  }
   105  
   106  proc random_term {} {
   107    lindex $::lVocab [expr {int(rand()*$::nVocab)}]
   108  }
   109  
   110  # Return a document consisting of $nWord arbitrarily selected terms
   111  # from the $::lVocab list.
   112  #
   113  proc generate_doc {nWord} {
   114    set doc [list]
   115    for {set i 0} {$i < $nWord} {incr i} {
   116      lappend doc [random_term]
   117    }
   118    return $doc
   119  }
   120  
   121  
   122  
   123  # Primitives to update the table.
   124  #
   125  unset -nocomplain t1
   126  proc insert_row {rowid} {
   127    set a [generate_doc [expr int((rand()*100))]]
   128    set b [generate_doc [expr int((rand()*100))]]
   129    set c [generate_doc [expr int((rand()*100))]]
   130    execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
   131    set ::t1($rowid) [list $a $b $c]
   132  }
   133  proc delete_row {rowid} {
   134    execsql { DELETE FROM t1 WHERE rowid = $rowid }
   135    catch {unset ::t1($rowid)}
   136  }
   137  proc update_row {rowid} {
   138    set cols {a b c}
   139    set iCol [expr int(rand()*3)]
   140    set doc  [generate_doc [expr int((rand()*100))]]
   141    lset ::t1($rowid) $iCol $doc
   142    execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
   143  }
   144  
   145  proc simple_phrase {zPrefix} {
   146    set ret [list]
   147  
   148    set reg [string map {* {[^ ]*}} $zPrefix]
   149    set reg " $reg "
   150  
   151    foreach key [lsort -integer [array names ::t1]] {
   152      set value $::t1($key)
   153      set cnt [list]
   154      foreach col $value {
   155        if {[regexp $reg " $col "]} { lappend ret $key ; break }
   156      }
   157    }
   158  
   159    #lsort -uniq -integer $ret
   160    set ret
   161  }
   162  
   163  # This [proc] is used to test the FTS3 matchinfo() function.
   164  # 
   165  proc simple_token_matchinfo {zToken bDesc} {
   166  
   167    set nDoc(0) 0
   168    set nDoc(1) 0
   169    set nDoc(2) 0
   170    set nHit(0) 0
   171    set nHit(1) 0
   172    set nHit(2) 0
   173  
   174    set dir -inc
   175    if {$bDesc} { set dir -dec }
   176  
   177    foreach key [array names ::t1] {
   178      set value $::t1($key)
   179      set a($key) [list]
   180      foreach i {0 1 2} col $value {
   181        set hit [llength [lsearch -all $col $zToken]]
   182        lappend a($key) $hit
   183        incr nHit($i) $hit
   184        if {$hit>0} { incr nDoc($i) }
   185      }
   186    }
   187  
   188    set ret [list]
   189    foreach docid [lsort -integer $dir [array names a]] {
   190      if { [lindex [lsort -integer $a($docid)] end] } {
   191        set matchinfo [list 1 3]
   192        foreach i {0 1 2} hit $a($docid) {
   193          lappend matchinfo $hit $nHit($i) $nDoc($i)
   194        }
   195        lappend ret $docid $matchinfo
   196      }
   197    }
   198  
   199    set ret
   200  } 
   201  
   202  proc simple_near {termlist nNear} {
   203    set ret [list]
   204  
   205    foreach {key value} [array get ::t1] {
   206      foreach v $value {
   207  
   208        set l [lsearch -exact -all $v [lindex $termlist 0]]
   209        foreach T [lrange $termlist 1 end] {
   210          set l2 [list]
   211          foreach i $l {
   212            set iStart [expr $i - $nNear - 1]
   213            set iEnd [expr $i + $nNear + 1]
   214            if {$iStart < 0} {set iStart 0}
   215            foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
   216              incr i2 $iStart
   217              if {$i2 != $i} { lappend l2 $i2 } 
   218            }
   219          }
   220          set l [lsort -uniq -integer $l2]
   221        }
   222  
   223        if {[llength $l]} {
   224  #puts "MATCH($key): $v"
   225          lappend ret $key
   226        } 
   227      }
   228    }
   229  
   230    lsort -unique -integer $ret
   231  }
   232  
   233  # The following three procs:
   234  # 
   235  #   setup_not A B
   236  #   setup_or  A B
   237  #   setup_and A B
   238  #
   239  # each take two arguments. Both arguments must be lists of integer values
   240  # sorted by value. The return value is the list produced by evaluating
   241  # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
   242  # AND.
   243  #
   244  proc setop_not {A B} {
   245    foreach b $B { set n($b) {} }
   246    set ret [list]
   247    foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
   248    return $ret
   249  }
   250  proc setop_or {A B} {
   251    lsort -integer -uniq [concat $A $B]
   252  }
   253  proc setop_and {A B} {
   254    foreach b $B { set n($b) {} }
   255    set ret [list]
   256    foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
   257    return $ret
   258  }
   259  
   260  proc mit {blob} {
   261    set scan(littleEndian) i*
   262    set scan(bigEndian) I*
   263    binary scan $blob $scan($::tcl_platform(byteOrder)) r
   264    return $r
   265  }
   266  db func mit mit
   267  set sqlite_fts3_enable_parentheses 1
   268  
   269  proc do_orderbydocid_test {tn sql res} {
   270    uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
   271    uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
   272      [lsort -int -dec $res]
   273    ]
   274  }
   275  
   276  set NUM_TRIALS 100
   277  
   278  foreach {nodesize order} {
   279    50    DESC
   280    50    ASC
   281    500   ASC
   282    1000  DESC
   283    2000  ASC
   284  } {
   285    catch { array unset ::t1 }
   286    set testname "$nodesize/$order"
   287  
   288    # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
   289    #
   290    db transaction {
   291      catchsql { DROP TABLE t1 }
   292      execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
   293      execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
   294      for {set i 0} {$i < 100} {incr i} { insert_row $i }
   295    }
   296    
   297    for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
   298      catchsql COMMIT
   299  
   300      set DO_MALLOC_TEST 0
   301      set nRep 10
   302      if {$iTest==100 && $nodesize==50} { 
   303        set DO_MALLOC_TEST 1 
   304        set nRep 2
   305      }
   306  
   307      set ::testprefix fts3rnd-1.$testname.$iTest
   308    
   309      # Delete one row, update one row and insert one row.
   310      #
   311      set rows [array names ::t1]
   312      set nRow [llength $rows]
   313      set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
   314      set iDelete $iUpdate
   315      while {$iDelete == $iUpdate} {
   316        set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
   317      }
   318      set iInsert $iUpdate
   319      while {[info exists ::t1($iInsert)]} {
   320        set iInsert [expr {int(rand()*1000000)}]
   321      }
   322      execsql BEGIN
   323        insert_row $iInsert
   324        update_row $iUpdate
   325        delete_row $iDelete
   326      if {0==($iTest%2)} { execsql COMMIT }
   327  
   328      if {0==($iTest%2)} { 
   329        #do_test 0 { fts3_integrity_check t1 } ok 
   330      }
   331  
   332      # Pick 10 terms from the vocabulary. Check that the results of querying
   333      # the database for the set of documents containing each of these terms
   334      # is the same as the result obtained by scanning the contents of the Tcl 
   335      # array for each term.
   336      #
   337      for {set i 0} {$i < 10} {incr i} {
   338        set term [random_term]
   339        do_select_test 1.$i.asc {
   340          SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
   341          ORDER BY docid ASC
   342        } [simple_token_matchinfo $term 0]
   343        do_select_test 1.$i.desc {
   344          SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
   345          ORDER BY docid DESC
   346        } [simple_token_matchinfo $term 1]
   347      }
   348  
   349      # This time, use the first two characters of each term as a term prefix
   350      # to query for. Test that querying the Tcl array produces the same results
   351      # as querying the FTS3 table for the prefix.
   352      #
   353      for {set i 0} {$i < $nRep} {incr i} {
   354        set prefix [string range [random_term] 0 end-1]
   355        set match "${prefix}*"
   356        do_orderbydocid_test 2.$i {
   357          SELECT docid FROM t1 WHERE t1 MATCH $match
   358        } [simple_phrase $match]
   359      }
   360  
   361      # Similar to the above, except for phrase queries.
   362      #
   363      for {set i 0} {$i < $nRep} {incr i} {
   364        set term [list [random_term] [random_term]]
   365        set match "\"$term\""
   366        do_orderbydocid_test 3.$i {
   367          SELECT docid FROM t1 WHERE t1 MATCH $match
   368        } [simple_phrase $term]
   369      }
   370  
   371      # Three word phrases.
   372      #
   373      for {set i 0} {$i < $nRep} {incr i} {
   374        set term [list [random_term] [random_term] [random_term]]
   375        set match "\"$term\""
   376        do_orderbydocid_test 4.$i {
   377          SELECT docid FROM t1 WHERE t1 MATCH $match
   378        } [simple_phrase $term]
   379      }
   380  
   381      # Three word phrases made up of term-prefixes.
   382      #
   383      for {set i 0} {$i < $nRep} {incr i} {
   384        set    query "[string range [random_term] 0 end-1]* "
   385        append query "[string range [random_term] 0 end-1]* "
   386        append query "[string range [random_term] 0 end-1]*"
   387  
   388        set match "\"$query\""
   389        do_orderbydocid_test 5.$i {
   390          SELECT docid FROM t1 WHERE t1 MATCH $match
   391        } [simple_phrase $query]
   392      }
   393  
   394      # A NEAR query with terms as the arguments:
   395      #
   396      #     ... MATCH '$term1 NEAR $term2' ...
   397      #
   398      for {set i 0} {$i < $nRep} {incr i} {
   399        set terms [list [random_term] [random_term]]
   400        set match [join $terms " NEAR "]
   401        do_orderbydocid_test 6.$i {
   402          SELECT docid FROM t1 WHERE t1 MATCH $match 
   403        } [simple_near $terms 10]
   404      }
   405  
   406      # A 3-way NEAR query with terms as the arguments.
   407      #
   408      for {set i 0} {$i < $nRep} {incr i} {
   409        set terms [list [random_term] [random_term] [random_term]]
   410        set nNear 11
   411        set match [join $terms " NEAR/$nNear "]
   412        do_orderbydocid_test 7.$i {
   413          SELECT docid FROM t1 WHERE t1 MATCH $match
   414        } [simple_near $terms $nNear]
   415      }
   416      
   417      # Set operations on simple term queries.
   418      #
   419      foreach {tn op proc} {
   420        8  OR  setop_or
   421        9  NOT setop_not
   422        10 AND setop_and
   423      } {
   424        for {set i 0} {$i < $nRep} {incr i} {
   425          set term1 [random_term]
   426          set term2 [random_term]
   427          set match "$term1 $op $term2"
   428          do_orderbydocid_test $tn.$i {
   429            SELECT docid FROM t1 WHERE t1 MATCH $match
   430          } [$proc [simple_phrase $term1] [simple_phrase $term2]]
   431        }
   432      }
   433   
   434      # Set operations on NEAR queries.
   435      #
   436      foreach {tn op proc} {
   437        11 OR  setop_or
   438        12 NOT setop_not
   439        13 AND setop_and
   440      } {
   441        for {set i 0} {$i < $nRep} {incr i} {
   442          set term1 [random_term]
   443          set term2 [random_term]
   444          set term3 [random_term]
   445          set term4 [random_term]
   446          set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
   447          do_orderbydocid_test $tn.$i {
   448            SELECT docid FROM t1 WHERE t1 MATCH $match
   449          } [$proc                                  \
   450              [simple_near [list $term1 $term2] 10] \
   451              [simple_near [list $term3 $term4] 10]
   452            ]
   453        }
   454      }
   455  
   456      catchsql COMMIT
   457    }
   458  }
   459  
   460  finish_test