github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/helpers/tokenizer_test.go

github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/helpers/tokenizer_test.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package helpers
    13  
    14  import (
    15  	"testing"
    16  
    17  	"github.com/stretchr/testify/assert"
    18  	"github.com/weaviate/weaviate/entities/models"
    19  )
    20  
    21  func TestTokenise(t *testing.T) {
    22  	UseGse = true
    23  	init_gse()
    24  	tokens := Tokenize(models.PropertyTokenizationTrigram, "Thequickbrownfoxjumpsoverthelazydog")
    25  	assert.Equal(t, []string{"the", "heq", "equ", "qui", "uic", "ick", "ckb", "kbr", "bro", "row", "own", "wnf", "nfo", "fox", "oxj", "xju", "jum", "ump", "mps", "pso", "sov", "ove", "ver", "ert", "rth", "the", "hel", "ela", "laz", "azy", "zyd", "ydo", "dog"}, tokens)
    26  
    27  	tokens = Tokenize(models.PropertyTokenizationTrigram, "The quick brown fox jumps over the lazy dog")
    28  	assert.Equal(t, []string{"the", "heq", "equ", "qui", "uic", "ick", "ckb", "kbr", "bro", "row", "own", "wnf", "nfo", "fox", "oxj", "xju", "jum", "ump", "mps", "pso", "sov", "ove", "ver", "ert", "rth", "the", "hel", "ela", "laz", "azy", "zyd", "ydo", "dog"}, tokens)
    29  
    30  	tokens = Tokenize(models.PropertyTokenizationTrigram, "いろはにほへとちりぬるをわかよたれそつねならむうゐのおくやまけふこえてあさきゆめみしゑひもせす")
    31  	assert.Equal(t, []string{"いろは", "ろはに", "はにほ", "にほへ", "ほへと", "へとち", "とちり", "ちりぬ", "りぬる", "ぬるを", "るをわ", "をわか", "わかよ", "かよた", "よたれ", "たれそ", "れそつ", "そつね", "つねな", "ねなら", "ならむ", "らむう", "むうゐ", "うゐの", "ゐのお", "のおく", "おくや", "くやま", "やまけ", "まけふ", "けふこ", "ふこえ", "こえて", "えてあ", "てあさ", "あさき", "さきゆ", "きゆめ", "ゆめみ", "めみし", "みしゑ", "しゑひ", "ゑひも", "ひもせ", "もせす"}, tokens)
    32  
    33  	tokens = Tokenize(models.PropertyTokenizationTrigram, `春の夜の夢はうつつよりもかなしき
    34  	夏の夜の夢はうつつに似たり
    35  	秋の夜の夢はうつつを超え
    36  	冬の夜の夢は心に響く
    37  
    38  	山のあなたに小さな村が見える
    39  	川の音が静かに耳に届く
    40  	風が木々を通り抜ける音
    41  	星空の下、すべてが平和である`)
    42  	assert.Equal(t, []string{"春の夜", "の夜の", "夜の夢", "の夢は", "夢はう", "はうつ", "うつつ", "つつよ", "つより", "よりも", "りもか", "もかな", "かなし", "なしき", "しき夏", "き夏の", "夏の夜", "の夜の", "夜の夢", "の夢は", "夢はう", "はうつ", "うつつ", "つつに", "つに似", "に似た", "似たり", "たり秋", "り秋の", "秋の夜", "の夜の", "夜の夢", "の夢は", "夢はう", "はうつ", "うつつ", "つつを", "つを超", "を超え", "超え冬", "え冬の", "冬の夜", "の夜の", "夜の夢", "の夢は", "夢は心", "は心に", "心に響", "に響く", "響く山", "く山の", "山のあ", "のあな", "あなた", "なたに", "たに小", "に小さ", "小さな", "さな村", "な村が", "村が見", "が見え", "見える", "える川", "る川の", "川の音", "の音が", "音が静", "が静か", "静かに", "かに耳", "に耳に", "耳に届", "に届く", "届く風", "く風が", "風が木", "が木々", "木々を", "々を通", "を通り", "通り抜", "り抜け", "抜ける", "ける音", "る音星", "音星空", "星空の", "空の下", "の下す", "下すべ", "すべて", "べてが", "てが平", "が平和", "平和で", "和であ", "である"}, tokens)
    43  
    44  	tokens = Tokenize(models.PropertyTokenizationGse, `春の夜の夢はうつつよりもかなしき
    45  	夏の夜の夢はうつつに似たり
    46  	秋の夜の夢はうつつを超え
    47  	冬の夜の夢は心に響く
    48  
    49  	山のあなたに小さな村が見える
    50  	川の音が静かに耳に届く
    51  	風が木々を通り抜ける音
    52  	星空の下、すべてが平和である`)
    53  	assert.Equal(t, []string{"春の", "夜", "の", "夢", "はう", "うつ", "うつつ", "つつ", "つよ", "より", "も", "かな", "かなし", "かなしき", "なし", "しき", "\n", "\t", "夏", "の", "夜", "の", "夢", "はう", "うつ", "うつつ", "つつ", "に", "似", "たり", "\n", "\t", "秋", "の", "夜", "の", "夢", "はう", "うつ", "うつつ", "つつ", "を", "超え", "\n", "\t", "冬", "の", "夜", "の", "夢", "は", "心", "に", "響く", "\n", "\n", "\t", "山", "の", "あな", "あなた", "に", "小さ", "小さな", "村", "が", "見え", "見える", "える", "\n", "\t", "川", "の", "音", "が", "静か", "かに", "耳", "に", "届く", "\n", "\t", "風", "が", "木々", "を", "通り", "通り抜け", "通り抜ける", "抜け", "抜ける", "ける", "音", "\n", "\t", "星空", "の", "下", "、", "すべ", "すべて", "が", "平和", "で", "ある", "春の夜の夢はうつつよりもかなしき", "夏の夜の夢はうつつに似たり", "秋の夜の夢はうつつを超え", "冬の夜の夢は心に響く", "山のあなたに小さな村が見える", "川の音が静かに耳に届く", "風が木々を通り抜ける音", "星空の下", "すべてが平和である"}, tokens)
    54  
    55  	tokens = Tokenize(models.PropertyTokenizationGse, "素早い茶色の狐が怠けた犬を飛び越えた")
    56  	assert.Equal(t, []string{"素早", "素早い", "早い", "茶色", "の", "狐", "が", "怠け", "けた", "犬", "を", "飛び", "飛び越え", "越え", "た", "素早い茶色の狐が怠けた犬を飛び越えた"}, tokens)
    57  
    58  	tokens = Tokenize(models.PropertyTokenizationGse, "すばやいちゃいろのきつねがなまけたいぬをとびこえた")
    59  	assert.Equal(t, []string{"すばや", "すばやい", "やい", "いち", "ちゃ", "ちゃい", "ちゃいろ", "いろ", "のき", "きつ", "きつね", "つね", "ねが", "がな", "なま", "なまけ", "まけ", "けた", "けたい", "たい", "いぬ", "を", "とび", "とびこえ", "こえ", "た", "すばやいちゃいろのきつねがなまけたいぬをとびこえた"}, tokens)
    60  
    61  	tokens = Tokenize(models.PropertyTokenizationGse, "スバヤイチャイロノキツネガナマケタイヌヲトビコエタ")
    62  	assert.Equal(t, []string{"スバ", "ヤイ", "イチ", "チャイ", "チャイロ", "ノ", "キツ", "キツネ", "ツネ", "ネガ", "ナマ", "ケタ", "タイ", "イヌ", "ヲ", "トビ", "コ", "エ", "タ", "スバヤイチャイロノキツネガナマケタイヌヲトビコエタ"}, tokens)
    63  
    64  	tokens = Tokenize(models.PropertyTokenizationGse, "The quick brown fox jumps over the lazy dog")
    65  	assert.Equal(t, []string{"t", "h", "e", "q", "u", "i", "c", "k", "b", "r", "o", "w", "n", "f", "o", "x", "j", "u", "m", "p", "s", "o", "v", "e", "r", "t", "h", "e", "l", "a", "z", "y", "d", "o", "g", "the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"}, tokens)
    66  }
    67  
    68  func TestTokenize(t *testing.T) {
    69  	input := " Hello You*-beautiful_world?!"
    70  
    71  	type testCase struct {
    72  		tokenization string
    73  		expected     []string
    74  	}
    75  
    76  	t.Run("tokenize", func(t *testing.T) {
    77  		testCases := []testCase{
    78  			{
    79  				tokenization: models.PropertyTokenizationField,
    80  				expected:     []string{"Hello You*-beautiful_world?!"},
    81  			},
    82  			{
    83  				tokenization: models.PropertyTokenizationWhitespace,
    84  				expected:     []string{"Hello", "You*-beautiful_world?!"},
    85  			},
    86  			{
    87  				tokenization: models.PropertyTokenizationLowercase,
    88  				expected:     []string{"hello", "you*-beautiful_world?!"},
    89  			},
    90  			{
    91  				tokenization: models.PropertyTokenizationWord,
    92  				expected:     []string{"hello", "you", "beautiful", "world"},
    93  			},
    94  		}
    95  
    96  		for _, tc := range testCases {
    97  			terms := Tokenize(tc.tokenization, input)
    98  			assert.ElementsMatch(t, tc.expected, terms)
    99  		}
   100  	})
   101  
   102  	t.Run("tokenize with wildcards", func(t *testing.T) {
   103  		testCases := []testCase{
   104  			{
   105  				tokenization: models.PropertyTokenizationField,
   106  				expected:     []string{"Hello You*-beautiful_world?!"},
   107  			},
   108  			{
   109  				tokenization: models.PropertyTokenizationWhitespace,
   110  				expected:     []string{"Hello", "You*-beautiful_world?!"},
   111  			},
   112  			{
   113  				tokenization: models.PropertyTokenizationLowercase,
   114  				expected:     []string{"hello", "you*-beautiful_world?!"},
   115  			},
   116  			{
   117  				tokenization: models.PropertyTokenizationWord,
   118  				expected:     []string{"hello", "you*", "beautiful", "world?"},
   119  			},
   120  		}
   121  
   122  		for _, tc := range testCases {
   123  			terms := TokenizeWithWildcards(tc.tokenization, input)
   124  			assert.ElementsMatch(t, tc.expected, terms)
   125  		}
   126  	})
   127  }
   128  
   129  func TestTokenizeAndCountDuplicates(t *testing.T) {
   130  	input := "Hello You Beautiful World! hello you beautiful world!"
   131  
   132  	type testCase struct {
   133  		tokenization string
   134  		expected     map[string]int
   135  	}
   136  
   137  	testCases := []testCase{
   138  		{
   139  			tokenization: models.PropertyTokenizationField,
   140  			expected: map[string]int{
   141  				"Hello You Beautiful World! hello you beautiful world!": 1,
   142  			},
   143  		},
   144  		{
   145  			tokenization: models.PropertyTokenizationWhitespace,
   146  			expected: map[string]int{
   147  				"Hello":     1,
   148  				"You":       1,
   149  				"Beautiful": 1,
   150  				"World!":    1,
   151  				"hello":     1,
   152  				"you":       1,
   153  				"beautiful": 1,
   154  				"world!":    1,
   155  			},
   156  		},
   157  		{
   158  			tokenization: models.PropertyTokenizationLowercase,
   159  			expected: map[string]int{
   160  				"hello":     2,
   161  				"you":       2,
   162  				"beautiful": 2,
   163  				"world!":    2,
   164  			},
   165  		},
   166  		{
   167  			tokenization: models.PropertyTokenizationWord,
   168  			expected: map[string]int{
   169  				"hello":     2,
   170  				"you":       2,
   171  				"beautiful": 2,
   172  				"world":     2,
   173  			},
   174  		},
   175  	}
   176  
   177  	for _, tc := range testCases {
   178  		t.Run(tc.tokenization, func(t *testing.T) {
   179  			terms, dups := TokenizeAndCountDuplicates(tc.tokenization, input)
   180  
   181  			assert.Len(t, terms, len(tc.expected))
   182  			assert.Len(t, dups, len(tc.expected))
   183  
   184  			for i := range terms {
   185  				assert.Contains(t, tc.expected, terms[i])
   186  				assert.Equal(t, tc.expected[terms[i]], dups[i])
   187  			}
   188  		})
   189  	}
   190  }