github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/tok_test.go

/*
 * Copyright 2016-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tok

import (
	"math"
	"sort"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

// encL pairs int64 values with their encoded string tokens so that both
// slices can be reordered together when sorting.
type encL struct {
	ints   []int64
	tokens []string
}

// byEnc sorts an encL by its integer values, carrying the tokens along.
type byEnc struct{ encL }

func (o byEnc) Less(i, j int) bool { return o.ints[i] < o.ints[j] }

func (o byEnc) Len() int { return len(o.ints) }

func (o byEnc) Swap(i, j int) {
	o.ints[i], o.ints[j] = o.ints[j], o.ints[i]
	o.tokens[i], o.tokens[j] = o.tokens[j], o.tokens[i]
}

func TestIntEncoding(t *testing.T) {
	a := int64(1<<24 + 10)
	b := int64(-1<<24 - 1)
	c := int64(math.MaxInt64)
	d := int64(math.MinInt64)
	enc := encL{}
	arr := []int64{a, b, c, d, 1, 2, 3, 4, -1, -2, -3, 0, 234, 10000, 123, -1543}
	enc.ints = arr
	for _, it := range arr {
		encoded := encodeInt(it)
		enc.tokens = append(enc.tokens, encoded)
	}
	sort.Sort(byEnc{enc})
	for i := 1; i < len(enc.tokens); i++ {
		// Once sorted by integer value, the encoded tokens must be in
		// strictly increasing lexicographic order as well.
		require.True(t, enc.tokens[i-1] < enc.tokens[i], "%d %v vs %d %v",
			enc.ints[i-1], []byte(enc.tokens[i-1]), enc.ints[i], []byte(enc.tokens[i]))
	}
}

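// A minimal sketch (hypothetical, not this package's actual encodeInt) of
// one way to achieve the property tested above: emit the value's big-endian
// bytes with the sign bit flipped, so that lexicographic order of the
// encoded strings matches numeric order of the integers.
func encodeIntSketch(v int64) string {
	u := uint64(v) ^ (1 << 63) // flip the sign bit so negatives sort first
	var buf [8]byte
	for i := 7; i >= 0; i-- {
		buf[i] = byte(u)
		u >>= 8
	}
	return string(buf[:])
}
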
func TestFullTextTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	tokens, err := BuildTokens("Stemming works!", GetLangTokenizer(tokenizer, "en"))
	require.NoError(t, err)
	require.Equal(t, 2, len(tokens))
	id := tokenizer.Identifier()
	// Terms are lowercased and stemmed before being encoded.
	require.Equal(t, []string{encodeToken("stem", id), encodeToken("work", id)}, tokens)
}

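// The encodeToken calls in these tests namespace each token with its
// tokenizer's identifier byte, so tokens produced by different tokenizers
// cannot collide in the index. A sketch of that scheme (hypothetical
// stand-in, not necessarily the package's exact implementation):
func encodeTokenSketch(tok string, id byte) string {
	return string([]byte{id}) + tok
}
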
func TestHourTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("hour")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	dt, err := time.Parse(time.RFC3339, "2017-01-01T12:12:12Z")
	require.NoError(t, err)

	tokens, err := BuildTokens(dt, tokenizer)
	require.NoError(t, err)
	require.Equal(t, 1, len(tokens))
	// One identifier byte plus two bytes each for year, month, day, and hour.
	require.Equal(t, 1+2*4, len(tokens[0]))
}

func TestDayTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("day")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	dt, err := time.Parse(time.RFC3339, "2017-01-01T12:12:12Z")
	require.NoError(t, err)

	tokens, err := BuildTokens(dt, tokenizer)
	require.NoError(t, err)
	require.Equal(t, 1, len(tokens))
	// One identifier byte plus two bytes each for year, month, and day.
	require.Equal(t, 1+2*3, len(tokens[0]))
}

func TestMonthTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("month")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	dt, err := time.Parse(time.RFC3339, "2017-01-01T12:12:12Z")
	require.NoError(t, err)

	tokens, err := BuildTokens(dt, tokenizer)
	require.NoError(t, err)
	require.Equal(t, 1, len(tokens))
	// One identifier byte plus two bytes each for year and month.
	require.Equal(t, 1+2*2, len(tokens[0]))
}

func TestYearTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("year")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	dt, err := time.Parse(time.RFC3339, "2017-01-01T12:12:12Z")
	require.NoError(t, err)

	tokens, err := BuildTokens(dt, tokenizer)
	require.NoError(t, err)
	require.Equal(t, 1, len(tokens))
	// One identifier byte plus two bytes for the year.
	require.Equal(t, 1+2, len(tokens[0]))
}

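// The four datetime checks above follow one pattern, so they could also be
// written as a single table-driven test; a sketch under the same
// assumptions (one identifier byte plus two bytes per encoded component):
func TestDateTimeTokenLengthsSketch(t *testing.T) {
	dt, err := time.Parse(time.RFC3339, "2017-01-01T12:12:12Z")
	require.NoError(t, err)
	for _, tc := range []struct {
		name    string
		wantLen int
	}{
		{"year", 1 + 2},    // id byte + year
		{"month", 1 + 2*2}, // id byte + year + month
		{"day", 1 + 2*3},   // id byte + year + month + day
		{"hour", 1 + 2*4},  // id byte + year + month + day + hour
	} {
		tokenizer, has := GetTokenizer(tc.name)
		require.True(t, has)
		tokens, err := BuildTokens(dt, tokenizer)
		require.NoError(t, err)
		require.Equal(t, 1, len(tokens))
		require.Equal(t, tc.wantLen, len(tokens[0]), "tokenizer %q", tc.name)
	}
}
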
func TestFullTextTokenizerLang(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	tokens, err := BuildTokens("Katzen und Auffassung und Auffassung", GetLangTokenizer(tokenizer, "de"))
	require.NoError(t, err)
	require.Equal(t, 2, len(tokens))
	id := tokenizer.Identifier()
	// Tokens must be sorted and unique: the repeated "Auffassung" appears
	// once, and German stemming reduces "Katzen" to "katz".
	require.Equal(t, []string{encodeToken("auffassung", id), encodeToken("katz", id)}, tokens)
}

func TestTermTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("term")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	tokens, err := BuildTokens("Tokenizer works works!", tokenizer)
	require.NoError(t, err)
	require.Equal(t, 2, len(tokens))
	id := tokenizer.Identifier()
	// The duplicate "works" is collapsed; terms are lowercased, not stemmed.
	require.Equal(t, []string{encodeToken("tokenizer", id), encodeToken("works", id)}, tokens)
}

func TestTrigramTokenizer(t *testing.T) {
	tokenizer, has := GetTokenizer("trigram")
	require.True(t, has)
	require.NotNil(t, tokenizer)
	tokens, err := BuildTokens("Dgraph rocks!", tokenizer)
	require.NoError(t, err)
	require.Equal(t, 11, len(tokens))
	id := tokenizer.Identifier()
	expected := []string{
		encodeToken("Dgr", id),
		encodeToken("gra", id),
		encodeToken("rap", id),
		encodeToken("aph", id),
		encodeToken("ph ", id),
		encodeToken("h r", id),
		encodeToken(" ro", id),
		encodeToken("roc", id),
		encodeToken("ock", id),
		encodeToken("cks", id),
		encodeToken("ks!", id),
	}
	sort.Strings(expected)
	require.Equal(t, expected, tokens)
}

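// The trigram tokenizer's raw tokens are every 3-byte window of the input;
// the tokens compared above are then prefixed with the identifier byte and
// sorted. A minimal sketch of the windowing step (hypothetical helper, not
// the package's implementation):
func trigramsSketch(s string) []string {
	var out []string
	for i := 0; i+3 <= len(s); i++ {
		out = append(out, s[i:i+3])
	}
	return out
}
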
func TestGetFullTextTokens(t *testing.T) {
	val := "Our chief weapon is surprise...surprise and fear...fear and surprise...." +
		"Our two weapons are fear and surprise...and ruthless efficiency.... " +
		"Our three weapons are fear, surprise, and ruthless efficiency..."
	tokens, err := (&FullTextTokenizer{lang: "en"}).Tokens(val)
	require.NoError(t, err)

	expected := []string{"chief", "weapon", "surpris", "fear", "ruthless", "effici", "two", "three"}
	sort.Strings(expected)

	// Ensure that the tokens are sorted and unique.
	require.Equal(t, expected, tokens)
}

func TestGetFullTextTokens1(t *testing.T) {
	tokens, err := GetFullTextTokens([]string{"Quick brown fox"}, "en")
	require.NoError(t, err)
	require.NotNil(t, tokens)
	require.Equal(t, 3, len(tokens))
}

func TestGetFullTextTokensInvalidLang(t *testing.T) {
	// An unrecognized language tag must not cause an error; the input is
	// still tokenized into three tokens, just as with "en".
	tokens, err := GetFullTextTokens([]string{"Quick brown fox"}, "xxx_such_language")
	require.NoError(t, err)
	require.NotNil(t, tokens)
	require.Equal(t, 3, len(tokens))
}

// NOTE: The Chinese/Japanese/Korean tests are based on assuming that the
// observed output is correct (it was captured and added to the tests), with
// some verification using Google Translate.

func TestFullTextTokenizerCJKChinese(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	got, err := BuildTokens("他是一个薪水很高的商人", GetLangTokenizer(tokenizer, "zh"))
	require.NoError(t, err)

	id := tokenizer.Identifier()
	wantToks := []string{
		encodeToken("一个", id),
		encodeToken("个薪", id),
		encodeToken("他是", id),
		encodeToken("商人", id),
		encodeToken("很高", id),
		encodeToken("是一", id),
		encodeToken("水很", id),
		encodeToken("的商", id),
		encodeToken("薪水", id),
		encodeToken("高的", id),
	}
	require.Equal(t, wantToks, got)
	checkSortedAndUnique(t, got)
}

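// For unsegmented text like the Chinese input above (and the Japanese one
// below), the full-text tokenizer produces overlapping character bigrams;
// the space-separated Korean input below is split into words instead. A
// minimal sketch of the bigram step (hypothetical helper; the real analyzer
// also normalizes the text, and the final tokens end up sorted):
func cjkBigramsSketch(s string) []string {
	runes := []rune(s)
	seen := make(map[string]struct{})
	var out []string
	for i := 0; i+1 < len(runes); i++ {
		tok := string(runes[i : i+2])
		if _, ok := seen[tok]; !ok { // de-duplicate repeated bigrams
			seen[tok] = struct{}{}
			out = append(out, tok)
		}
	}
	sort.Strings(out)
	return out
}
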
func TestFullTextTokenizerCJKKorean(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	got, err := BuildTokens("그는 큰 급여를 가진 사업가입니다.", GetLangTokenizer(tokenizer, "ko"))
	require.NoError(t, err)

	id := tokenizer.Identifier()
	wantToks := []string{
		encodeToken("가진", id),
		encodeToken("그는", id),
		encodeToken("급여를", id),
		encodeToken("사업가입니다", id),
		encodeToken("큰", id),
	}
	require.Equal(t, wantToks, got)
	checkSortedAndUnique(t, got)
}

func TestFullTextTokenizerCJKJapanese(t *testing.T) {
	tokenizer, has := GetTokenizer("fulltext")
	require.True(t, has)
	require.NotNil(t, tokenizer)

	got, err := BuildTokens("彼は大きな給与を持つ実業家です", GetLangTokenizer(tokenizer, "ja"))
	require.NoError(t, err)

	id := tokenizer.Identifier()
	wantToks := []string{
		encodeToken("きな", id),
		encodeToken("つ実", id),
		encodeToken("です", id),
		encodeToken("な給", id),
		encodeToken("は大", id),
		encodeToken("を持", id),
		encodeToken("与を", id),
		encodeToken("大き", id),
		encodeToken("実業", id),
		encodeToken("家で", id),
		encodeToken("彼は", id),
		encodeToken("持つ", id),
		encodeToken("業家", id),
		encodeToken("給与", id),
	}
	require.Equal(t, wantToks, got)
	checkSortedAndUnique(t, got)
}

func checkSortedAndUnique(t *testing.T, tokens []string) {
	if !sort.StringsAreSorted(tokens) {
		t.Error("tokens were not sorted")
	}
	set := make(map[string]struct{})
	for _, tok := range tokens {
		if _, ok := set[tok]; ok {
			t.Error("tokens are not unique")
		}
		set[tok] = struct{}{}
	}
}