github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/stemmers.go (about)

     1  /*
     2   * Copyright 2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package tok
    18  
    19  import (
    20  	"github.com/blevesearch/bleve/analysis"
    21  	_ "github.com/blevesearch/bleve/analysis/lang/ar" // Needed for bleve language support.
    22  	_ "github.com/blevesearch/bleve/analysis/lang/cjk"
    23  	_ "github.com/blevesearch/bleve/analysis/lang/ckb"
    24  	_ "github.com/blevesearch/bleve/analysis/lang/da"
    25  	_ "github.com/blevesearch/bleve/analysis/lang/de"
    26  	_ "github.com/blevesearch/bleve/analysis/lang/es"
    27  	_ "github.com/blevesearch/bleve/analysis/lang/fi"
    28  	_ "github.com/blevesearch/bleve/analysis/lang/fr"
    29  	_ "github.com/blevesearch/bleve/analysis/lang/hi"
    30  	_ "github.com/blevesearch/bleve/analysis/lang/hu"
    31  	_ "github.com/blevesearch/bleve/analysis/lang/it"
    32  	_ "github.com/blevesearch/bleve/analysis/lang/nl"
    33  	_ "github.com/blevesearch/bleve/analysis/lang/no"
    34  	_ "github.com/blevesearch/bleve/analysis/lang/pt"
    35  	_ "github.com/blevesearch/bleve/analysis/lang/ro"
    36  	_ "github.com/blevesearch/bleve/analysis/lang/ru"
    37  	_ "github.com/blevesearch/bleve/analysis/lang/sv"
    38  	_ "github.com/blevesearch/bleve/analysis/lang/tr"
    39  	_ "github.com/blevesearch/bleve/analysis/token/porter"
    40  	"github.com/golang/glog"
    41  )
    42  
    43  var langStemmers = map[string]string{
    44  	"ar":  "stemmer_ar",
    45  	"ckb": "stemmer_ckb",
    46  	"da":  "stemmer_da_snowball",
    47  	"de":  "stemmer_de_light",
    48  	"en":  "stemmer_porter",
    49  	"es":  "stemmer_es_light",
    50  	"fi":  "stemmer_fi_snowball",
    51  	"fr":  "stemmer_fr_light",
    52  	"hi":  "stemmer_hi",
    53  	"hu":  "stemmer_hu_snowball",
    54  	"it":  "stemmer_it_light",
    55  	"ja":  "cjk_bigram",
    56  	"ko":  "cjk_bigram",
    57  	"nl":  "stemmer_nl_snowball",
    58  	"no":  "stemmer_no_snowball",
    59  	"pt":  "stemmer_pt_light",
    60  	"ro":  "stemmer_ro_snowball",
    61  	"ru":  "stemmer_ru_snowball",
    62  	"sv":  "stemmer_sv_snowball",
    63  	"tr":  "stemmer_tr_snowball",
    64  	"zh":  "cjk_bigram",
    65  }
    66  
    67  // filterStemmers filters stems using an existing filter, imported here.
    68  // If the lang filter is found, the we will forward requests to it.
    69  // Returns filtered tokens if filter is found, otherwise returns tokens unmodified.
    70  func filterStemmers(lang string, input analysis.TokenStream) analysis.TokenStream {
    71  	if len(input) == 0 {
    72  		return input
    73  	}
    74  	// check if we have stemmer filter for this lang.
    75  	name, ok := langStemmers[lang]
    76  	if !ok {
    77  		return input
    78  	}
    79  	// get filter from concurrent cache so we dont recreate.
    80  	filter, err := bleveCache.TokenFilterNamed(name)
    81  	if err != nil {
    82  		glog.Errorf("Error while filtering %q stems: %s", lang, err)
    83  		return input
    84  	}
    85  	if filter != nil {
    86  		return filter.Filter(input)
    87  	}
    88  	return input
    89  }