github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/tok/bleve.go (about) 1 /* 2 * Copyright 2018 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package tok 18 19 import ( 20 "github.com/dgraph-io/dgraph/x" 21 22 "github.com/blevesearch/bleve/analysis" 23 "github.com/blevesearch/bleve/analysis/analyzer/custom" 24 "github.com/blevesearch/bleve/analysis/token/lowercase" 25 "github.com/blevesearch/bleve/analysis/token/unicodenorm" 26 "github.com/blevesearch/bleve/analysis/tokenizer/unicode" 27 "github.com/blevesearch/bleve/registry" 28 ) 29 30 const unicodenormName = "unicodenorm_nfkc" 31 32 var ( 33 bleveCache = registry.NewCache() 34 termAnalyzer, fulltextAnalyzer *analysis.Analyzer 35 ) 36 37 // setupBleve creates bleve filters and analyzers that we use for term and fulltext tokenizers. 38 func setupBleve() { 39 // unicode normalizer filter - simplifies unicode words using Normalization Form KC (NFKC) 40 // See: http://unicode.org/reports/tr15/#Norm_Forms 41 _, err := bleveCache.DefineTokenFilter(unicodenormName, 42 map[string]interface{}{ 43 "type": unicodenorm.Name, 44 "form": unicodenorm.NFKC, 45 }) 46 x.Check(err) 47 48 // term analyzer - splits on word boundaries, lowercase and normalize tokens. 49 termAnalyzer, err = bleveCache.DefineAnalyzer("term", 50 map[string]interface{}{ 51 "type": custom.Name, 52 "tokenizer": unicode.Name, 53 "token_filters": []string{ 54 lowercase.Name, 55 unicodenormName, 56 }, 57 }) 58 x.Check(err) 59 60 // fulltext analyzer - does language stop-words removal and stemming. 61 fulltextAnalyzer, err = bleveCache.DefineAnalyzer("fulltext", 62 map[string]interface{}{ 63 "type": custom.Name, 64 "tokenizer": unicode.Name, 65 "token_filters": []string{ 66 lowercase.Name, 67 unicodenormName, 68 }, 69 }) 70 x.Check(err) 71 } 72 73 // uniqueTerms takes a token stream and returns a string slice of unique terms. 74 func uniqueTerms(tokens analysis.TokenStream) []string { 75 var terms []string 76 for i := range tokens { 77 terms = append(terms, string(tokens[i].Term)) 78 } 79 terms = x.RemoveDuplicates(terms) 80 return terms 81 }