github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/stopwords_test.go

/*
 * Copyright 2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tok

import (
	"testing"

	"github.com/blevesearch/bleve/analysis"
	"github.com/stretchr/testify/require"
)

// TestFilterStopwords drives filterStopwords through table-driven cases:
// English and Spanish streams with stopwords interleaved, an unrecognized
// language tag that must pass tokens through untouched, and empty inputs.
func TestFilterStopwords(t *testing.T) {
	tests := []struct {
		lang string
		in   analysis.TokenStream
		out  analysis.TokenStream
	}{
		// English: per the expected output, "the" and "over" are stopwords.
		{lang: "en",
			in: analysis.TokenStream{
				&analysis.Token{Term: []byte("the")},
				&analysis.Token{Term: []byte("quick")},
				&analysis.Token{Term: []byte("brown")},
				&analysis.Token{Term: []byte("foxes")},
				&analysis.Token{Term: []byte("jump")},
				&analysis.Token{Term: []byte("over")},
				&analysis.Token{Term: []byte("the")},
				&analysis.Token{Term: []byte("big")},
				&analysis.Token{Term: []byte("dogs")},
			},
			out: analysis.TokenStream{
				&analysis.Token{Term: []byte("quick")},
				&analysis.Token{Term: []byte("brown")},
				&analysis.Token{Term: []byte("foxes")},
				&analysis.Token{Term: []byte("jump")},
				&analysis.Token{Term: []byte("big")},
				&analysis.Token{Term: []byte("dogs")},
			},
		},
		// Spanish ("wishing much joy to the young ladies and gentlemen
		// programmers of Dgraph"): the articles, conjunction, and
		// preposition a/las/y/los/de are dropped.
		{lang: "es",
			in: analysis.TokenStream{
				&analysis.Token{Term: []byte("deseándoles")},
				&analysis.Token{Term: []byte("muchas")},
				&analysis.Token{Term: []byte("alegrías")},
				&analysis.Token{Term: []byte("a")},
				&analysis.Token{Term: []byte("las")},
				&analysis.Token{Term: []byte("señoritas")},
				&analysis.Token{Term: []byte("y")},
				&analysis.Token{Term: []byte("los")},
				&analysis.Token{Term: []byte("señores")},
				&analysis.Token{Term: []byte("programadores")},
				&analysis.Token{Term: []byte("de")},
				&analysis.Token{Term: []byte("Dgraph")},
			},
			out: analysis.TokenStream{
				&analysis.Token{Term: []byte("deseándoles")},
				&analysis.Token{Term: []byte("muchas")},
				&analysis.Token{Term: []byte("alegrías")},
				&analysis.Token{Term: []byte("señoritas")},
				&analysis.Token{Term: []byte("señores")},
				&analysis.Token{Term: []byte("programadores")},
				&analysis.Token{Term: []byte("Dgraph")},
			},
		},
		// Unrecognized language tag: the stream passes through unchanged.
		{lang: "x-klingon",
			in: analysis.TokenStream{
				&analysis.Token{Term: []byte("tlhIngan")},
				&analysis.Token{Term: []byte("maH!")},
			},
			out: analysis.TokenStream{
				&analysis.Token{Term: []byte("tlhIngan")},
				&analysis.Token{Term: []byte("maH!")},
			},
		},
		// A lone stopword filters down to an empty stream.
		{lang: "en",
			in: analysis.TokenStream{
				&analysis.Token{Term: []byte("same")},
			},
			out: analysis.TokenStream{},
		},
		// Empty input stays empty.
		{lang: "en",
			in:  analysis.TokenStream{},
			out: analysis.TokenStream{},
		},
		// Empty language tag: nothing is filtered, even an empty term.
		{lang: "",
			in: analysis.TokenStream{
				&analysis.Token{Term: []byte("")},
			},
			out: analysis.TokenStream{
				&analysis.Token{Term: []byte("")},
			},
		},
	}

	for _, tc := range tests {
		out := filterStopwords(tc.lang, tc.in)
		require.Equal(t, tc.out, out, "lang=%q", tc.lang)
	}
}
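
For context, the filterStopwords function under test is defined elsewhere in the tok package (presumably tok/stopwords.go, not shown here). Below is a minimal sketch of how it could be built on bleve's stop-token filter (github.com/blevesearch/bleve/analysis/token/stop). The langStopwords table is a hypothetical stand-in holding only the terms exercised by the cases above, not Dgraph's actual per-language stopword data, and the real implementation may resolve stopword lists quite differently.

package tok

import (
	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/analysis/token/stop"
)

// langStopwords maps a language tag to its stopword set. Hypothetical data:
// a real implementation would load bleve's full per-language stopword lists.
var langStopwords = map[string]analysis.TokenMap{
	"en": {"the": true, "over": true, "same": true},
	"es": {"a": true, "las": true, "y": true, "los": true, "de": true},
}

// filterStopwords removes stopwords for a recognized language tag and returns
// the stream unchanged for unknown tags ("x-klingon", ""), matching the
// passthrough cases in the test.
func filterStopwords(lang string, input analysis.TokenStream) analysis.TokenStream {
	stopwords, ok := langStopwords[lang]
	if !ok {
		return input
	}
	return stop.NewStopTokensFilter(stopwords).Filter(input)
}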