github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/stopwords.go (about)

     1  /*
     2   * Copyright 2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package tok
    18  
    19  import (
    20  	"github.com/blevesearch/bleve/analysis"
    21  	_ "github.com/blevesearch/bleve/analysis/lang/ar" // Needed for bleve language support.
    22  	_ "github.com/blevesearch/bleve/analysis/lang/bg"
    23  	_ "github.com/blevesearch/bleve/analysis/lang/ca"
    24  	_ "github.com/blevesearch/bleve/analysis/lang/ckb"
    25  	_ "github.com/blevesearch/bleve/analysis/lang/cs"
    26  	_ "github.com/blevesearch/bleve/analysis/lang/da"
    27  	_ "github.com/blevesearch/bleve/analysis/lang/de"
    28  	_ "github.com/blevesearch/bleve/analysis/lang/el"
    29  	_ "github.com/blevesearch/bleve/analysis/lang/en"
    30  	_ "github.com/blevesearch/bleve/analysis/lang/es"
    31  	_ "github.com/blevesearch/bleve/analysis/lang/eu"
    32  	_ "github.com/blevesearch/bleve/analysis/lang/fa"
    33  	_ "github.com/blevesearch/bleve/analysis/lang/fi"
    34  	_ "github.com/blevesearch/bleve/analysis/lang/fr"
    35  	_ "github.com/blevesearch/bleve/analysis/lang/ga"
    36  	_ "github.com/blevesearch/bleve/analysis/lang/gl"
    37  	_ "github.com/blevesearch/bleve/analysis/lang/hi"
    38  	_ "github.com/blevesearch/bleve/analysis/lang/hu"
    39  	_ "github.com/blevesearch/bleve/analysis/lang/hy"
    40  	_ "github.com/blevesearch/bleve/analysis/lang/id"
    41  	_ "github.com/blevesearch/bleve/analysis/lang/it"
    42  	_ "github.com/blevesearch/bleve/analysis/lang/nl"
    43  	_ "github.com/blevesearch/bleve/analysis/lang/no"
    44  	_ "github.com/blevesearch/bleve/analysis/lang/pt"
    45  	_ "github.com/blevesearch/bleve/analysis/lang/ro"
    46  	_ "github.com/blevesearch/bleve/analysis/lang/ru"
    47  	_ "github.com/blevesearch/bleve/analysis/lang/sv"
    48  	_ "github.com/blevesearch/bleve/analysis/lang/tr"
    49  	"github.com/golang/glog"
    50  )
    51  
    52  var langStops = map[string]string{
    53  	"ar":  "stop_ar",
    54  	"bg":  "stop_bg",
    55  	"ca":  "stop_ca",
    56  	"ckb": "stop_ckb",
    57  	"cs":  "stop_cs",
    58  	"da":  "stop_da",
    59  	"de":  "stop_de",
    60  	"el":  "stop_el",
    61  	"en":  "stop_en",
    62  	"es":  "stop_es",
    63  	"eu":  "stop_eu",
    64  	"fa":  "stop_fa",
    65  	"fi":  "stop_fi",
    66  	"fr":  "stop_fr",
    67  	"ga":  "stop_ga",
    68  	"gl":  "stop_gl",
    69  	"hi":  "stop_hi",
    70  	"hu":  "stop_hu",
    71  	"hy":  "stop_hy",
    72  	"id":  "stop_id",
    73  	"it":  "stop_it",
    74  	"nl":  "stop_nl",
    75  	"no":  "stop_no",
    76  	"pt":  "stop_pt",
    77  	"ro":  "stop_ro",
    78  	"ru":  "stop_ru",
    79  	"sv":  "stop_sv",
    80  	"tr":  "stop_tr",
    81  }
    82  
    83  // filterStopwords filters stop words using an existing filter, imported here.
    84  // If the lang filter is found, the we will forward requests to it.
    85  // Returns filtered tokens if filter is found, otherwise returns tokens unmodified.
    86  func filterStopwords(lang string, input analysis.TokenStream) analysis.TokenStream {
    87  	if len(input) == 0 {
    88  		return input
    89  	}
    90  	// check if we have stop words filter for this lang.
    91  	name, ok := langStops[lang]
    92  	if !ok {
    93  		return input
    94  	}
    95  	// get filter from concurrent cache so we dont recreate.
    96  	filter, err := bleveCache.TokenFilterNamed(name)
    97  	if err != nil {
    98  		glog.Errorf("Error while filtering %q stop words: %s", lang, err)
    99  		return input
   100  	}
   101  	if filter != nil {
   102  		return filter.Filter(input)
   103  	}
   104  	return input
   105  }