github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/bleve.go (about)

     1  /*
     2   * Copyright 2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package tok
    18  
    19  import (
    20  	"github.com/dgraph-io/dgraph/x"
    21  
    22  	"github.com/blevesearch/bleve/analysis"
    23  	"github.com/blevesearch/bleve/analysis/analyzer/custom"
    24  	"github.com/blevesearch/bleve/analysis/token/lowercase"
    25  	"github.com/blevesearch/bleve/analysis/token/unicodenorm"
    26  	"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
    27  	"github.com/blevesearch/bleve/registry"
    28  )
    29  
// unicodenormName is the registry key under which the NFKC unicode
// normalization token filter is registered by setupBleve.
const unicodenormName = "unicodenorm_nfkc"

var (
	// bleveCache holds every filter/analyzer this package registers with bleve.
	bleveCache                     = registry.NewCache()
	// termAnalyzer and fulltextAnalyzer are populated by setupBleve and used
	// by the term and fulltext tokenizers; nil until setupBleve runs.
	termAnalyzer, fulltextAnalyzer *analysis.Analyzer
)
    36  
    37  // setupBleve creates bleve filters and analyzers that we use for term and fulltext tokenizers.
    38  func setupBleve() {
    39  	// unicode normalizer filter - simplifies unicode words using Normalization Form KC (NFKC)
    40  	// See: http://unicode.org/reports/tr15/#Norm_Forms
    41  	_, err := bleveCache.DefineTokenFilter(unicodenormName,
    42  		map[string]interface{}{
    43  			"type": unicodenorm.Name,
    44  			"form": unicodenorm.NFKC,
    45  		})
    46  	x.Check(err)
    47  
    48  	// term analyzer - splits on word boundaries, lowercase and normalize tokens.
    49  	termAnalyzer, err = bleveCache.DefineAnalyzer("term",
    50  		map[string]interface{}{
    51  			"type":      custom.Name,
    52  			"tokenizer": unicode.Name,
    53  			"token_filters": []string{
    54  				lowercase.Name,
    55  				unicodenormName,
    56  			},
    57  		})
    58  	x.Check(err)
    59  
    60  	// fulltext analyzer - does language stop-words removal and stemming.
    61  	fulltextAnalyzer, err = bleveCache.DefineAnalyzer("fulltext",
    62  		map[string]interface{}{
    63  			"type":      custom.Name,
    64  			"tokenizer": unicode.Name,
    65  			"token_filters": []string{
    66  				lowercase.Name,
    67  				unicodenormName,
    68  			},
    69  		})
    70  	x.Check(err)
    71  }
    72  
    73  // uniqueTerms takes a token stream and returns a string slice of unique terms.
    74  func uniqueTerms(tokens analysis.TokenStream) []string {
    75  	var terms []string
    76  	for i := range tokens {
    77  		terms = append(terms, string(tokens[i].Term))
    78  	}
    79  	terms = x.RemoveDuplicates(terms)
    80  	return terms
    81  }