github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/tok_test.go

/*
 * Copyright 2016-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tok

import (
	"encoding/binary"
	"math"
	"sort"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

// encL pairs each int64 with its encoded token so both slices can be sorted
// in lockstep.
type encL struct {
	ints   []int64
	tokens []string
}

// byEnc sorts an encL by the original integer values.
type byEnc struct{ encL }

func (o byEnc) Less(i, j int) bool { return o.ints[i] < o.ints[j] }

func (o byEnc) Len() int { return len(o.ints) }

func (o byEnc) Swap(i, j int) {
	o.ints[i], o.ints[j] = o.ints[j], o.ints[i]
	o.tokens[i], o.tokens[j] = o.tokens[j], o.tokens[i]
}

func TestIntEncoding(t *testing.T) {
	a := int64(1<<24 + 10)
	b := int64(-1<<24 - 1)
	c := int64(math.MaxInt64)
	d := int64(math.MinInt64)
	enc := encL{}
	arr := []int64{a, b, c, d, 1, 2, 3, 4, -1, -2, -3, 0, 234, 10000, 123, -1543}
	enc.ints = arr
	for _, it := range arr {
		encoded := encodeInt(it)
		enc.tokens = append(enc.tokens, encoded)
	}
	sort.Sort(byEnc{enc})
	for i := 1; i < len(enc.tokens); i++ {
		// After sorting by the original ints, the encoded tokens must also be
		// in strictly increasing order, i.e. the encoding preserves ordering.
		require.True(t, enc.tokens[i-1] < enc.tokens[i], "%d %v vs %d %v",
			enc.ints[i-1], []byte(enc.tokens[i-1]), enc.ints[i], []byte(enc.tokens[i]))
	}
}

func TestFullTextTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	tokens, err := BuildTokens("Stemming works!", GetLangTokenizer(tokenizer, "en"))
	require.NoError(t, err)
	require.Equal(t, 2, len(tokens))
	id := tokenizer.Identifier()
	require.Equal(t, []string{encodeToken("stem", id), encodeToken("work", id)}, tokens)
}

func TestHourTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("hour")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	dt, err := time.Parse(time.RFC3339, "2017-01-01T12:12:12Z")
	require.NoError(t, err)

	tokens, err := BuildTokens(dt, tokenizer)
	require.NoError(t, err)
	require.Equal(t, 1, len(tokens))
	require.Equal(t, 1+2*4, len(tokens[0]))
}

func TestDayTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("day")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	dt, err := time.Parse(time.RFC3339, "2017-01-01T12:12:12Z")
	require.NoError(t, err)

	tokens, err := BuildTokens(dt, tokenizer)
	require.NoError(t, err)
	require.Equal(t, 1, len(tokens))
	require.Equal(t, 1+2*3, len(tokens[0]))
}

func TestMonthTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("month")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	dt, err := time.Parse(time.RFC3339, "2017-01-01T12:12:12Z")
	require.NoError(t, err)

	tokens, err := BuildTokens(dt, tokenizer)
	require.NoError(t, err)
	require.Equal(t, 1, len(tokens))
	require.Equal(t, 1+2*2, len(tokens[0]))
}

func TestYearTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("year")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	dt, err := time.Parse(time.RFC3339, "2017-01-01T12:12:12Z")
	require.NoError(t, err)

	tokens, err := BuildTokens(dt, tokenizer)
	require.NoError(t, err)
	require.Equal(t, 1, len(tokens))
	require.Equal(t, 1+2, len(tokens[0]))
}
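
// The hour/day/month/year assertions above all check lengths of the form
// 1+2*n, which suggests one tokenizer-identifier byte followed by two
// big-endian bytes per time component (year, month, day, hour). The sketch
// below only reproduces that width arithmetic; the exact layout and the
// placeholder identifier are assumptions drawn from those assertions, not
// from the actual encoder.
func TestDateTokenWidthSketch(t *testing.T) {
	buf := make([]byte, 1+2*3) // hypothetical "day" token: id + year + month + day
	buf[0] = 0xff              // placeholder identifier byte, not a real tokenizer id
	binary.BigEndian.PutUint16(buf[1:3], 2017) // year
	binary.BigEndian.PutUint16(buf[3:5], 1)    // month
	binary.BigEndian.PutUint16(buf[5:7], 1)    // day
	require.Equal(t, 1+2*3, len(buf)) // the same width TestDayTokenizer asserts
}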

func TestFullTextTokenizerLang(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	tokens, err := BuildTokens("Katzen und Auffassung und Auffassung", GetLangTokenizer(tokenizer, "de"))
	require.NoError(t, err)
	require.Equal(t, 2, len(tokens))
	id := tokenizer.Identifier()
	// Tokens should be sorted and unique.
	require.Equal(t, []string{encodeToken("auffassung", id), encodeToken("katz", id)}, tokens)
}

func TestTermTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("term")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	tokens, err := BuildTokens("Tokenizer works works!", tokenizer)
	require.NoError(t, err)
	require.Equal(t, 2, len(tokens))
	id := tokenizer.Identifier()
	require.Equal(t, []string{encodeToken("tokenizer", id), encodeToken("works", id)}, tokens)
}

func TestTrigramTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("trigram")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	tokens, err := BuildTokens("Dgraph rocks!", tokenizer)
	require.NoError(t, err)
	require.Equal(t, 11, len(tokens))
	id := tokenizer.Identifier()
	expected := []string{
		encodeToken("Dgr", id),
		encodeToken("gra", id),
		encodeToken("rap", id),
		encodeToken("aph", id),
		encodeToken("ph ", id),
		encodeToken("h r", id),
		encodeToken(" ro", id),
		encodeToken("roc", id),
		encodeToken("ock", id),
		encodeToken("cks", id),
		encodeToken("ks!", id),
	}
	sort.Strings(expected)
	require.Equal(t, expected, tokens)
}
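
// TestTrigramWindowSketch spells out the arithmetic behind the 11-token
// expectation above: sliding a three-byte window across "Dgraph rocks!"
// (13 bytes) yields 13-3+1 = 11 trigrams. This is a sketch of the apparent
// windowing only, not the trigram tokenizer's actual implementation.
func TestTrigramWindowSketch(t *testing.T) {
	s := "Dgraph rocks!"
	var trigrams []string
	for i := 0; i+3 <= len(s); i++ {
		trigrams = append(trigrams, s[i:i+3])
	}
	require.Equal(t, 11, len(trigrams))
}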

func TestGetFullTextTokens(t *testing.T) {
	val := "Our chief weapon is surprise...surprise and fear...fear and surprise...." +
		"Our two weapons are fear and surprise...and ruthless efficiency.... " +
		"Our three weapons are fear, surprise, and ruthless efficiency..."
	tokens, err := (&FullTextTokenizer{lang: "en"}).Tokens(val)
	require.NoError(t, err)

	expected := []string{"chief", "weapon", "surpris", "fear", "ruthless", "effici", "two", "three"}
	sort.Strings(expected)

	// Ensure that tokens are sorted and unique.
	require.Equal(t, expected, tokens)
}

func TestGetFullTextTokens1(t *testing.T) {
	tokens, err := GetFullTextTokens([]string{"Quick brown fox"}, "en")
	require.NoError(t, err)
	require.NotNil(t, tokens)
	require.Equal(t, 3, len(tokens))
}

func TestGetFullTextTokensInvalidLang(t *testing.T) {
	tokens, err := GetFullTextTokens([]string{"Quick brown fox"}, "xxx_such_language")
	require.NoError(t, err)
	require.NotNil(t, tokens)
	require.Equal(t, 3, len(tokens))
}

// NOTE: The Chinese/Japanese/Korean tests are based on the assumption that
// the tokenizer's output is correct (the observed output was captured into
// the expectations below), with some verification using Google Translate.

func TestFullTextTokenizerCJKChinese(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	got, err := BuildTokens("他是一个薪水很高的商人", GetLangTokenizer(tokenizer, "zh"))
	require.NoError(t, err)

	id := tokenizer.Identifier()
	wantToks := []string{
		encodeToken("一个", id),
		encodeToken("个薪", id),
		encodeToken("他是", id),
		encodeToken("商人", id),
		encodeToken("很高", id),
		encodeToken("是一", id),
		encodeToken("水很", id),
		encodeToken("的商", id),
		encodeToken("薪水", id),
		encodeToken("高的", id),
	}
	require.Equal(t, wantToks, got)
	checkSortedAndUnique(t, got)
}

func TestFullTextTokenizerCJKKorean(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	got, err := BuildTokens("그는 큰 급여를 가진 사업가입니다.", GetLangTokenizer(tokenizer, "ko"))
	require.NoError(t, err)

	id := tokenizer.Identifier()
	wantToks := []string{
		encodeToken("가진", id),
		encodeToken("그는", id),
		encodeToken("급여를", id),
		encodeToken("사업가입니다", id),
		encodeToken("큰", id),
	}
	require.Equal(t, wantToks, got)
	checkSortedAndUnique(t, got)
}

func TestFullTextTokenizerCJKJapanese(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	got, err := BuildTokens("彼は大きな給与を持つ実業家です", GetLangTokenizer(tokenizer, "ja"))
	require.NoError(t, err)

	id := tokenizer.Identifier()
	wantToks := []string{
		encodeToken("きな", id),
		encodeToken("つ実", id),
		encodeToken("です", id),
		encodeToken("な給", id),
		encodeToken("は大", id),
		encodeToken("を持", id),
		encodeToken("与を", id),
		encodeToken("大き", id),
		encodeToken("実業", id),
		encodeToken("家で", id),
		encodeToken("彼は", id),
		encodeToken("持つ", id),
		encodeToken("業家", id),
		encodeToken("給与", id),
	}
	require.Equal(t, wantToks, got)
	checkSortedAndUnique(t, got)
}

// checkSortedAndUnique fails the test if tokens are not in sorted order or
// contain duplicates.
func checkSortedAndUnique(t *testing.T, tokens []string) {
	if !sort.StringsAreSorted(tokens) {
		t.Error("tokens were not sorted")
	}
	set := make(map[string]struct{})
	for _, tok := range tokens {
		if _, ok := set[tok]; ok {
			t.Error("tokens are not unique")
		}
		set[tok] = struct{}{}
	}
}
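
// TestCJKBigramSketch spells out the pattern visible in the Chinese and
// Japanese expectations above: every token is an overlapping two-rune
// window, so a text of n runes yields up to n-1 distinct bigrams (the
// 11-rune Chinese sentence produces the 10 tokens listed). The loop below
// illustrates that apparent behavior and is not the tokenizer's own code;
// note that Korean is handled differently, splitting on whitespace instead.
func TestCJKBigramSketch(t *testing.T) {
	runes := []rune("他是一个")
	var bigrams []string
	for i := 0; i+1 < len(runes); i++ {
		bigrams = append(bigrams, string(runes[i:i+2]))
	}
	require.Equal(t, []string{"他是", "是一", "一个"}, bigrams)
}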