gitlab.com/CoiaPrant/sqlite3@v1.19.1/testdata/tcl/fts2token.test

# 2007 June 21
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for the SQLite library. The focus
# of this script is testing the pluggable tokenizer feature of the
# FTS2 module.
#
# $Id: fts2token.test,v 1.3 2007/06/25 12:05:40 danielk1977 Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS2 is not defined, omit this file.
ifcapable !fts2 {
  finish_test
  return
}

# Return a copy of $str in which each character outside the ASCII range
# is replaced by an escape of the form \xXXXX (the character's Unicode
# code point in hex, zero-padded to at least four digits).
proc escape_string {str} {
  set out ""
  foreach char [split $str ""] {
    scan $char %c i
    if {$i<=127} {
      append out $char
    } else {
      append out [format {\x%.4x} $i]
    }
  }
  set out
}
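
# A quick illustrative check of escape_string. This test is an editorial
# addition, not part of the original suite; it assumes U+00E9 is rendered
# as the six-character sequence \x00e9.
do_test fts2token-0.1 {
  escape_string "caf\u00e9"
} {caf\x00e9}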

#--------------------------------------------------------------------------
# Test cases fts2token-1.* are the warm-body test for the SQL scalar
# function fts2_tokenizer(). The procedure is as follows:
#
#   1: Verify that there is no such fts2 tokenizer as 'blah'.
#
#   2: Query for the built-in tokenizer 'simple'. Insert a copy of the
#      retrieved value as tokenizer 'blah'.
#
#   3: Test that the value returned for tokenizer 'blah' is now the
#      same as that retrieved for 'simple'.
#
#   4: Test that it is now possible to create an fts2 table using
#      tokenizer 'blah' (it was not possible in step 1).
#
#   5: Test that the table created to use tokenizer 'blah' is usable.
#
do_test fts2token-1.1 {
  catchsql {
    CREATE VIRTUAL TABLE t1 USING fts2(content, tokenize blah);
  }
} {1 {unknown tokenizer: blah}}
do_test fts2token-1.2 {
  execsql {
    SELECT fts2_tokenizer('blah', fts2_tokenizer('simple')) IS NULL;
  }
} {0}
do_test fts2token-1.3 {
  execsql {
    SELECT fts2_tokenizer('blah') == fts2_tokenizer('simple');
  }
} {1}
do_test fts2token-1.4 {
  catchsql {
    CREATE VIRTUAL TABLE t1 USING fts2(content, tokenize blah);
  }
} {0 {}}
do_test fts2token-1.5 {
  execsql {
    INSERT INTO t1(content) VALUES('There was movement at the station');
    INSERT INTO t1(content) VALUES('For the word has passed around');
    INSERT INTO t1(content) VALUES('That the colt from ol regret had got away');
    SELECT content FROM t1 WHERE content MATCH 'movement'
  }
} {{There was movement at the station}}
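
# A further illustration (editorial addition, hypothetical test name): a
# table built with the cloned tokenizer 'blah' matches other terms exactly
# as a 'simple'-tokenized table would.
do_test fts2token-1.6 {
  execsql {
    SELECT content FROM t1 WHERE content MATCH 'passed'
  }
} {{For the word has passed around}}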

#--------------------------------------------------------------------------
# Test cases fts2token-2.* test error cases in the scalar-function-based
# API for getting and setting tokenizers.
#
do_test fts2token-2.1 {
  catchsql {
    SELECT fts2_tokenizer('nosuchtokenizer');
  }
} {1 {unknown tokenizer: nosuchtokenizer}}
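
# A related sketch (editorial addition, kept commented out): the second
# argument to fts2_tokenizer() must be a pointer-sized blob, so passing an
# ordinary string is expected to fail. The exact error message below is an
# assumption based on the fts2_tokenizer implementation.
#
#   do_test fts2token-2.2 {
#     catchsql { SELECT fts2_tokenizer('blah2', 'not a blob') }
#   } {1 {argument type mismatch}}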

#--------------------------------------------------------------------------
# Test cases fts2token-3.* test the three built-in tokenizers with a
# simple input string via the built-in test function. This is as much
# to test the test function as the tokenizer implementations.
#
do_test fts2token-3.1 {
  execsql {
    SELECT fts2_tokenizer_test('simple', 'I don''t see how');
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
do_test fts2token-3.2 {
  execsql {
    SELECT fts2_tokenizer_test('porter', 'I don''t see how');
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
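
# On the input above, 'porter' and 'simple' agree because no stemming rule
# applies. An illustrative sketch of where they diverge (editorial addition,
# hypothetical test name; the expected output assumes the standard Porter
# reductions "running" -> "run" and "runs" -> "run"):
do_test fts2token-3.2.1 {
  execsql {
    SELECT fts2_tokenizer_test('porter', 'running runs');
  }
} {{0 run running 1 run runs}}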
ifcapable icu {
  do_test fts2token-3.3 {
    execsql {
      SELECT fts2_tokenizer_test('icu', 'I don''t see how');
    }
  } {{0 i I 1 don't don't 2 see see 3 how how}}
}

#--------------------------------------------------------------------------
# Test cases fts2token-4.* test the ICU tokenizer. In practice, this
# tokenizer only has two modes - "thai" and "everybody else". Some other
# Asian languages (Lao, Khmer, etc.) require the same special treatment as
# Thai, but ICU doesn't support them yet.
#
ifcapable icu {

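  # do_icu_test runs fts2_tokenizer_test with an explicit locale argument
  # and compares the single value it returns against $output.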
  proc do_icu_test {name locale input output} {
    set ::out [db eval { SELECT fts2_tokenizer_test('icu', $locale, $input) }]
    do_test $name {
      lindex $::out 0
    } $output
  }

  do_icu_test fts2token-4.1 en_US  {}   {}
  do_icu_test fts2token-4.2 en_US {Test cases fts2} [list \
    0 test Test 1 cases cases 2 fts2 fts2
  ]

  # The following test shows that ICU is smart enough to recognise
  # Thai characters, even when the locale is set to English/United
  # States.
  #
  set input "\u0e2d\u0e30\u0e44\u0e23\u0e19\u0e30\u0e04\u0e23\u0e31\u0e1a"
  set output    "0 \u0e2d\u0e30\u0e44\u0e23 \u0e2d\u0e30\u0e44\u0e23 "
  append output "1 \u0e19\u0e30 \u0e19\u0e30 "
  append output "2 \u0e04\u0e23\u0e31\u0e1a \u0e04\u0e23\u0e31\u0e1a"

  do_icu_test fts2token-4.3 th_TH  $input $output
  do_icu_test fts2token-4.4 en_US  $input $output

  # ICU handles an unknown locale by falling back to the default.
  # So this is not an error.
  do_icu_test fts2token-4.5 MiddleOfTheOcean  $input $output

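  # As its name suggests, the token below is long enough to force a
  # realloc inside the ICU tokenizer code.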
  set    longtoken "AReallyReallyLongTokenOneThatWillSurelyRequire"
  append longtoken "AReallocInTheIcuTokenizerCode"

  set    input "short tokens then "
  append input $longtoken
  set    output "0 short short "
  append output "1 tokens tokens "
  append output "2 then then "
  append output "3 [string tolower $longtoken] $longtoken"

  do_icu_test fts2token-4.6 MiddleOfTheOcean  $input $output
  do_icu_test fts2token-4.7 th_TH  $input $output
  do_icu_test fts2token-4.8 en_US  $input $output
}

do_test fts2token-internal {
  execsql { SELECT fts2_tokenizer_internal_test() }
} {ok}

finish_test